From 78ed8f57d8815cdd5567533f7d3e25b959d861ab Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 12 Dec 2024 16:57:40 -0800 Subject: [PATCH 01/56] [Misc][V1] Fix type in v1 prefix caching (#11151) --- tests/v1/core/test_prefix_caching.py | 12 ++++++++---- vllm/v1/core/kv_cache_manager.py | 8 ++++---- vllm/v1/core/kv_cache_utils.py | 22 +++++++++++++++------- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index b44d3e5cb0678..00f7b0fcfe1dc 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -49,7 +49,7 @@ def test_prefill(): block_hash = hash_block_tokens(parent_block_hash, block_tokens) assert manager.block_pool[block_id].block_hash == block_hash assert manager.block_pool[block_id].ref_cnt == 1 - parent_block_hash = block_hash + parent_block_hash = block_hash.hash_value # Check partial/preallocated block metadata for block_id in (3, 4): @@ -360,11 +360,15 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): assert not computed_blocks # Just ask for 1 block. blocks = manager.allocate_slots(req, block_size, computed_blocks) + req.num_computed_tokens = block_size assert len(blocks) == 1 + num_preallocated_blocks - # Append slots to the block. - req.num_computed_tokens = block_size * len(blocks) # Assume all used. - blocks = manager.append_slots(req, block_size) # Append 1 block. + # Assume all computed. + manager.append_slots(req, block_size * (len(blocks) - 1)) + req.num_computed_tokens = block_size * len(blocks) + + # Append 1 block. + blocks = manager.append_slots(req, block_size) assert len(blocks) == 1 + num_preallocated_blocks diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index b492a755e6dd5..03cbb958237df 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -375,8 +375,8 @@ def _cache_full_blocks( prev_block: The previous block in the chain. """ # Update the new blocks with the block hashes through the chain. - prev_block_hash = (prev_block.block_hash - if prev_block is not None else None) + prev_block_hash_value = (prev_block.block_hash.hash_value + if prev_block is not None else None) for i, blk in enumerate(full_blocks): blk_idx = blk_start_idx + i @@ -390,10 +390,10 @@ def _cache_full_blocks( f"{request.request_id}({request})") # Compute the hash of the current block. - block_hash = hash_block_tokens(prev_block_hash, + block_hash = hash_block_tokens(prev_block_hash_value, tuple(block_tokens)) # Update and added the full block to the cache. blk.block_hash = block_hash self.cached_block_hash_to_block[block_hash][blk.block_id] = blk - prev_block_hash = block_hash + prev_block_hash_value = block_hash.hash_value diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index fb666c364bfb2..814e462a91fed 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,12 +1,19 @@ """KV-Cache Utilities.""" from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import List, NamedTuple, Optional, Tuple from vllm.logger import init_logger logger = init_logger(__name__) -BlockHashType = Tuple[int, Tuple[int]] + +class BlockHashType(NamedTuple): + """Hash value of a block and the token IDs in the block. + The reason we keep a tuple of token IDs is to make sure no hash + collision happens when the hash value is the same. 
+ """ + hash_value: int + token_ids: Tuple[int] @dataclass @@ -171,8 +178,8 @@ def hash_block_tokens(parent_block_hash: Optional[int], The hash value of the block and the token ids in the block. The entire tuple is used as the hash key of the block. """ - return (hash( - (parent_block_hash, *curr_block_token_ids)), curr_block_token_ids) + return BlockHashType(hash((parent_block_hash, *curr_block_token_ids)), + curr_block_token_ids) def hash_request_tokens(block_size: int, @@ -188,14 +195,15 @@ def hash_request_tokens(block_size: int, The list of computed hash values. """ ret = [] - parent_block_hash = None + parent_block_hash_value = None for start in range(0, len(token_ids), block_size): end = start + block_size block_token_ids = tuple(token_ids[start:end]) # Do not hash the block if it is not full. if len(block_token_ids) < block_size: break - block_hash = hash_block_tokens(parent_block_hash, block_token_ids) + block_hash = hash_block_tokens(parent_block_hash_value, + block_token_ids) ret.append(block_hash) - parent_block_hash = block_hash + parent_block_hash_value = block_hash.hash_value return ret From 30870b4f66414020645608b81dced94d8a99111c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Thu, 12 Dec 2024 22:19:23 -0500 Subject: [PATCH 02/56] [torch.compile] Dynamic fp8 + rms_norm fusion (#10906) Signed-off-by: luka Co-authored-by: Varun Sundar Rabindranath --- CMakeLists.txt | 3 +- .../fused_kernels/layernorm_rms_benchmarks.py | 173 +++++ csrc/dispatch_utils.h | 14 + csrc/ops.h | 8 + csrc/quantization/fp8/common.cuh | 26 +- ...fused_layernorm_dynamic_per_token_quant.cu | 160 ++++ .../fused_kernels/layernorm_utils.cuh | 327 ++++++++ .../fused_kernels/quant_conversions.cuh | 81 ++ csrc/quantization/vectorization.cuh | 33 + csrc/torch_bindings.cpp | 8 + tests/compile/test_functionalization.py | 21 +- tests/compile/test_fusion.py | 61 +- tests/kernels/test_fused_quant_layernorm.py | 171 +++++ vllm/_custom_ops.py | 20 + vllm/compilation/fix_functionalization.py | 9 +- vllm/compilation/fusion.py | 719 +++++++++++++----- vllm/compilation/fx_utils.py | 42 + vllm/compilation/multi_output_match.py | 105 +++ vllm/compilation/reshapes.py | 3 +- vllm/compilation/vllm_inductor_pass.py | 4 - 20 files changed, 1736 insertions(+), 252 deletions(-) create mode 100644 benchmarks/fused_kernels/layernorm_rms_benchmarks.py create mode 100644 csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu create mode 100644 csrc/quantization/fused_kernels/layernorm_utils.cuh create mode 100644 csrc/quantization/fused_kernels/quant_conversions.cuh create mode 100644 csrc/quantization/vectorization.cuh create mode 100644 tests/kernels/test_fused_quant_layernorm.py create mode 100644 vllm/compilation/fx_utils.py create mode 100644 vllm/compilation/multi_output_match.py diff --git a/CMakeLists.txt b/CMakeLists.txt index c78cdc77a7e42..bf19b3d227171 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,7 @@ set(VLLM_EXT_SRC "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" + "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" @@ -300,7 +301,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. 
- cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py new file mode 100644 index 0000000000000..ef91f9f8eb529 --- /dev/null +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -0,0 +1,173 @@ +import pickle as pkl +import time +from dataclasses import dataclass +from itertools import product +from typing import Callable, Iterable, List, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from tqdm import tqdm + +import vllm._custom_ops as ops +from vllm.model_executor.layers.layernorm import RMSNorm + + +@dataclass +class bench_params_t: + num_tokens: int + hidden_size: int + add_residual: bool + dtype: torch.dtype + + def description(self): + return (f'N {self.num_tokens} ' + f'x D {self.hidden_size} ' + f'x R {self.add_residual} ' + f'x DT {self.dtype}') + + +def get_bench_params() -> List[bench_params_t]: + ## Test Fixtures + NUM_TOKENS = [2**x for x in range(11)] + HIDDEN_SIZES = list(range(1024, 8129, 1024)) + ADD_RESIDUAL = [True, False] + DTYPES = [torch.bfloat16, torch.float] + + combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES) + bench_params = list(map(lambda x: \ + bench_params_t(x[0], x[1], x[2], x[3]), combinations)) + return bench_params + + +# Reference impls +def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _, _ = ops.scaled_int8_quant(torch_out) + + +def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _ = ops.scaled_fp8_quant(torch_out) + + +def fused_impl( + rms_norm_layer: RMSNorm, # this stores the weights + x: torch.Tensor, + residual: Optional[torch.Tensor], + quant_dtype: torch.dtype): + out, _ = ops.rms_norm_dynamic_per_token_quant(x, + rms_norm_layer.weight, + 1e-6, + quant_dtype, + residual=residual) + + +# Bench functions +def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor, + quant_dtype: torch.dtype, label: str, sub_label: str, + fn: Callable, description: str) -> TMeasurement: + + min_run_time = 1 + + globals = { + "rms_norm_layer": rms_norm_layer, + "x": x, + "residual": residual, + "quant_dtype": quant_dtype, + "fn": fn, + } + return TBenchmark.Timer( + stmt="fn(rms_norm_layer, x, residual, quant_dtype)", + globals=globals, + label=label, + sub_label=sub_label, + description=description, + ).blocked_autorange(min_run_time=min_run_time) + +def bench(params: bench_params_t, label: str, sub_label: str) \ + -> Iterable[TMeasurement]: + + # Make inputs + layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype) + # Make weights + layer.weight.data.normal_(mean=1.0, std=0.1) + # Make inputs + scale = 1 / params.hidden_size + x = torch.randn(params.num_tokens, + 
params.hidden_size, + dtype=params.dtype, + device='cuda') * scale + residual = (torch.randn_like(x) * scale).to(device='cuda') \ + if params.add_residual else None + + timers = [] + + # unfused int8 impl. + timers.append( + bench_fn(layer, x, residual, torch.int8, label, sub_label, + unfused_int8_impl, "unfused_int8_impl")) + + # unfused fp8 impl. + timers.append( + bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label, + unfused_fp8_impl, "unfused_fp8_impl")) + + # fused int8 impl. + timers.append( + bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl, + "fused_int8_impl")) + + # fused fp8 impl. + timers.append( + bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label, + fused_impl, "fused_fp8_impl")) + + print_timers(timers) + + return timers + + +# launch bench +# runner +def print_timers(timers: Iterable[TMeasurement]): + compare = TBenchmark.Compare(timers) + compare.print() + + +def main(): + torch.set_default_device('cuda') + bench_params = get_bench_params() + + timers = [] + for bp in tqdm(bench_params): + timers.extend( + bench(bp, "rms-norm-dynamic-per-token-quant", bp.description())) + print_timers(timers) + + # pickle all the results + timestamp = int(time.time()) + with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f: + pkl.dump(timers, f) + + +if __name__ == '__main__': + main() diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index a634e1c3d4886..03414b7e1ae93 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -14,6 +14,20 @@ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +// TODO(luka/varun): use FP8_TYPE macro after refactoring +#ifndef USE_ROCM + #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) +#else + #define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) +#endif + +#define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__)) + #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ diff --git a/csrc/ops.h b/csrc/ops.h index ea001190bc202..816b471d062d2 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -66,6 +66,14 @@ void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& weight, torch::Tensor& scale, double epsilon); +void rms_norm_dynamic_per_token_quant(torch::Tensor& out, + torch::Tensor const& input, + torch::Tensor const& weight, + torch::Tensor& scales, + double const epsilon, + std::optional scale_ub, + std::optional residual); + void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh index d7c0297d5333f..15bd5b6ed1564 100644 --- a/csrc/quantization/fp8/common.cuh +++ b/csrc/quantization/fp8/common.cuh @@ -1,6 +1,9 @@ #pragma once +#include "quantization/vectorization.cuh" + #include +#include #ifndef USE_ROCM #include @@ -15,6 +18,7 @@ using FP8_TYPE = c10::Float8_e4m3fnuz; // issue when running dynamic quantization. Here use 224.0f for rocm. 
constexpr auto FP8_E4M3_MAX = 224.0f; #endif +constexpr static auto kFp8Type = c10::CppTypeToScalarType::value; namespace vllm { @@ -89,22 +93,6 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, } } -template -struct __align__(8) vec4_t { - scalar_t x; - scalar_t y; - scalar_t z; - scalar_t w; -}; - -typedef struct __align__(4) { - FP8_TYPE x; - FP8_TYPE y; - FP8_TYPE z; - FP8_TYPE w; -} -float8x4_t; - template __device__ float thread_max_vec(scalar_t const* __restrict__ input, int64_t const num_elems, int const tid, @@ -139,10 +127,10 @@ __device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out, float const scale, int64_t const num_elems, int const tid, int const step) { + using float8x4_t = q8x4_t; // Vectorized input/output to better utilize memory bandwidth. - vec4_t const* vectorized_in = - reinterpret_cast const*>(input); - float8x4_t* vectorized_out = reinterpret_cast(out); + auto const* vectorized_in = reinterpret_cast const*>(input); + auto* vectorized_out = reinterpret_cast(out); int64_t const num_vec_elems = num_elems >> 2; diff --git a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu new file mode 100644 index 0000000000000..3c4f183bf4b59 --- /dev/null +++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu @@ -0,0 +1,160 @@ + +#include +#include + +#include "../../dispatch_utils.h" +#include "layernorm_utils.cuh" +#include "quant_conversions.cuh" + +namespace vllm { + +template +__device__ void rms_norm_dynamic_per_token_quant_vec( + scalar_out_t* __restrict__ out, // [..., hidden_size] + float* __restrict__ scales, // [num_tokens] + scalar_t const* __restrict__ input, // [..., hidden_size] + scalar_t const* __restrict__ weight, // [hidden_size] + float const* scale_ub, float const var_epsilon, + float const min_scaling_factor, int32_t const hidden_size, + scalar_t* __restrict__ residual = nullptr) { + float rms = 0.0f; + float token_scale = 0.0f; + + // Compute rms + vllm::vectorized::compute_rms( + &rms, input, hidden_size, var_epsilon, residual); + + // Compute scale + vllm::vectorized::compute_dynamic_per_token_scales( + &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor, + hidden_size, residual); + + // RMS Norm + Quant + if constexpr (std::is_same_v) { + vllm::vectorized::norm_and_quant( + out, input, weight, rms, 1.0f / token_scale, hidden_size, residual); + } else { + // FP8 - Do not invert token_scale for exact match with FBGemm + vllm::vectorized::norm_and_quant( + out, input, weight, rms, token_scale, hidden_size, residual); + } +} + +// RMS norm + quant kernel +template +__global__ void rms_norm_dynamic_per_token_quant_kernel( + scalar_out_t* __restrict__ out, // [..., hidden_size] + float* __restrict__ scales, // [num_tokens] + scalar_t const* __restrict__ input, // [..., hidden_size] + scalar_t const* __restrict__ weight, // [hidden_size] + float const* scale_ub, float const var_epsilon, + float const min_scaling_factor, int32_t const hidden_size, + scalar_t* __restrict__ residual = nullptr) { + // For vectorization, token_input and token_output pointers need to be + // aligned at 8-byte and 4-byte addresses respectively. 
+ bool const can_vectorize = hidden_size % 4 == 0; + + if (can_vectorize) { + return rms_norm_dynamic_per_token_quant_vec( + out, scales, input, weight, scale_ub, var_epsilon, min_scaling_factor, + hidden_size, residual); + } + + float rms = 0.0f; + float token_scale = 0.0f; + + // Compute RMS + vllm::compute_rms(&rms, input, hidden_size, + var_epsilon, residual); + // Compute Scale + vllm::compute_dynamic_per_token_scales( + &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor, + hidden_size, residual); + + // RMS Norm + Quant + if constexpr (std::is_same_v) { + vllm::norm_and_quant( + out, input, weight, rms, 1.0f / token_scale, hidden_size, residual); + } else { + // FP8 - Do not invert s_token_scale for exact match with FBGemm + vllm::norm_and_quant( + out, input, weight, rms, token_scale, hidden_size, residual); + } +} +} // namespace vllm + +// Residual add + RMS norm + dynamic per token +template +void rms_norm_dynamic_per_token_quant_dispatch( + torch::Tensor& out, // [..., hidden_size] + torch::Tensor const& input, // [..., hidden_size] + torch::Tensor const& weight, // [hidden_size] + torch::Tensor& scales, // [num_tokens] + double const var_epsilon, // Variance epsilon used in norm calculation + std::optional const& scale_ub, + std::optional& residual) { + int32_t hidden_size = input.size(-1); + int32_t num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + const float min_scaling_factor = + out.dtype() == torch::kInt8 + ? std::numeric_limits::epsilon() + : 1.0f / (std::numeric_limits::max() * 512.f); + + if (residual.has_value()) { + VLLM_DISPATCH_QUANT_TYPES( + out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] { + vllm::rms_norm_dynamic_per_token_quant_kernel + <<>>( + out.data_ptr(), scales.data_ptr(), + input.data_ptr(), weight.data_ptr(), + scale_ub.has_value() ? scale_ub->data_ptr() : nullptr, + var_epsilon, min_scaling_factor, hidden_size, + residual->data_ptr()); + }); + + } else { + VLLM_DISPATCH_QUANT_TYPES( + out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] { + vllm::rms_norm_dynamic_per_token_quant_kernel + <<>>( + out.data_ptr(), scales.data_ptr(), + input.data_ptr(), weight.data_ptr(), + scale_ub.has_value() ? 
scale_ub->data_ptr() : nullptr, + var_epsilon, min_scaling_factor, hidden_size, nullptr); + }); + } +} + +void rms_norm_dynamic_per_token_quant( + torch::Tensor& out, // [..., hidden_size] + torch::Tensor const& input, // [..., hidden_size] + torch::Tensor const& weight, // [hidden_size] + torch::Tensor& scales, // [num_tokens] + double const var_epsilon, // Variance epsilon used in norm calculation + std::optional scale_ub, std::optional residual) { + TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8); + TORCH_CHECK(out.is_contiguous() && input.is_contiguous()); + + if (scale_ub.has_value()) { + TORCH_CHECK(out.dtype() == kFp8Type); + } + TORCH_CHECK(scales.dtype() == torch::kFloat32); + + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] { + rms_norm_dynamic_per_token_quant_dispatch( + out, input, weight, scales, var_epsilon, scale_ub, residual); + }); +} diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh new file mode 100644 index 0000000000000..cec6b54edb569 --- /dev/null +++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh @@ -0,0 +1,327 @@ +#pragma once + +/** + * __device__ layernorm utilities. + */ + +#include "quantization/vectorization.cuh" +#include "quant_conversions.cuh" + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +// has_residual must be true, if residual is not a nullptr +template +__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, + int32_t const hidden_size, float const epsilon, + scalar_t const* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + // sum of squares + float ss = 0.0f; + + for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) { + float x = static_cast(input[token_offset + i]); + if constexpr (has_residual) { + x += static_cast(residual[token_offset + i]); + } + + ss += x * x; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x); + + __shared__ float s_rms; + if (threadIdx.x == 0) { + s_rms = rsqrtf(ss / hidden_size + epsilon); + } + __syncthreads(); + + *rms = s_rms; +} + +template +__device__ void compute_dynamic_per_token_scales( + float* __restrict__ token_scale, float* __restrict__ all_token_scales, + scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, + float const rms, float const* __restrict__ scale_ub, + float const min_scaling_factor, int32_t const hidden_size, + scalar_t const* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + ; + constexpr scalar_out_t qmax{std::numeric_limits::max()}; + + float block_absmax_val_maybe = 0.0f; + for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) { + float x = static_cast(input[token_offset + i]); + if constexpr (has_residual) { + x += static_cast(residual[token_offset + i]); + } + + x = static_cast(static_cast(x * rms) * weight[i]); + block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x)); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + block_absmax_val_maybe = + BlockReduce(reduceStore) + .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x); + + __shared__ float s_token_scale; + if (threadIdx.x == 0) { + float scale = 0.0f; + if (scale_ub) { + scale = 
min(block_absmax_val_maybe, *scale_ub); + } else { + scale = block_absmax_val_maybe; + } + // token scale computation + scale = max(scale / qmax, min_scaling_factor); + s_token_scale = scale; // Shared memory store + all_token_scales[blockIdx.x] = scale; // Global output store + } + __syncthreads(); + + *token_scale = s_token_scale; +} + +template +__device__ void norm_and_quant(scalar_out_t* __restrict__ output, + scalar_t const* __restrict__ input, + scalar_t const* __restrict__ weight, + float const rms, float const scale, + int32_t const hidden_size, + scalar_t* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + ; + + for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) { + float x = static_cast(input[token_offset + i]); + if constexpr (has_residual) { + x += static_cast(residual[token_offset + i]); + residual[token_offset + i] = static_cast(x); + } + // Norm + x = static_cast(static_cast(x * rms) * weight[i]); + // Quant + output[token_offset + i] = + ScaledQuant::quant_fn(x, scale); + } +} + +namespace vectorized { + +// Compute 1.0/rms(input) +// hidden_size must be a multiple of 4 +template +__device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, + int32_t const hidden_size, float const epsilon, + scalar_t const* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + + // Vectorized input/output to better utilize memory bandwidth. + vec4_t const* vec_input = + reinterpret_cast const*>(&input[token_offset]); + vec4_t const* vec_residual = nullptr; + if constexpr (has_residual) { + vec_residual = + reinterpret_cast const*>(&residual[token_offset]); + } + + // sum of squares + float ss = 0.0f; + + int32_t const num_vec_elems = hidden_size >> 2; + +#pragma unroll 4 + for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + vec4_t in = vec_input[i]; + + vec4_t x; + x.x = static_cast(in.x); + x.y = static_cast(in.y); + x.z = static_cast(in.z); + x.w = static_cast(in.w); + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; + x.x += static_cast(r.x); + x.y += static_cast(r.y); + x.z += static_cast(r.z); + x.w += static_cast(r.w); + } + + ss += x.x * x.x; + ss += x.y * x.y; + ss += x.z * x.z; + ss += x.w * x.w; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + ss = BlockReduce(reduceStore).Reduce(ss, cub::Sum{}, blockDim.x); + + __shared__ float s_rms; + if (threadIdx.x == 0) { + s_rms = rsqrtf(ss / hidden_size + epsilon); + } + __syncthreads(); + + *rms = s_rms; +} + +// Vectorized version of vllm::compute_dynamic_per_token_scales +// hidden_size must be a multiple of 4 +template +__device__ void compute_dynamic_per_token_scales( + float* __restrict__ token_scale, float* __restrict__ all_token_scales, + scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, + float const rms, float const* __restrict__ scale_ub, + float const min_scaling_factor, int32_t const hidden_size, + scalar_t const* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + ; + + // Vectorized input/weight/residual to better utilize memory bandwidth. 
+ vec4_t const* vec_input = + reinterpret_cast const*>(&input[token_offset]); + vec4_t const* vec_weight = + reinterpret_cast const*>(weight); + vec4_t const* vec_residual = nullptr; + if constexpr (has_residual) { + vec_residual = + reinterpret_cast const*>(&residual[token_offset]); + } + + constexpr scalar_out_t qmax{std::numeric_limits::max()}; + + int32_t const num_vec_elems = hidden_size >> 2; + float block_absmax_val_maybe = 0.0f; + +#pragma unroll 4 + for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + vec4_t in = vec_input[i]; + vec4_t const w = vec_weight[i]; + + vec4_t x; + x.x = static_cast(in.x); + x.y = static_cast(in.y); + x.z = static_cast(in.z); + x.w = static_cast(in.w); + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; + x.x += static_cast(r.x); + x.y += static_cast(r.y); + x.z += static_cast(r.z); + x.w += static_cast(r.w); + } + + block_absmax_val_maybe = fmaxf( + block_absmax_val_maybe, fabs(static_cast(x.x * rms) * w.x)); + block_absmax_val_maybe = fmaxf( + block_absmax_val_maybe, fabs(static_cast(x.y * rms) * w.y)); + block_absmax_val_maybe = fmaxf( + block_absmax_val_maybe, fabs(static_cast(x.z * rms) * w.z)); + block_absmax_val_maybe = fmaxf( + block_absmax_val_maybe, fabs(static_cast(x.w * rms) * w.w)); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + block_absmax_val_maybe = + BlockReduce(reduceStore) + .Reduce(block_absmax_val_maybe, cub::Max{}, blockDim.x); + + __shared__ float s_token_scale; + if (threadIdx.x == 0) { + float scale = 0.0f; + if (scale_ub) { + scale = min(block_absmax_val_maybe, *scale_ub); + } else { + scale = block_absmax_val_maybe; + } + // token scale computation + scale = max(scale / qmax, min_scaling_factor); + s_token_scale = scale; // shared memory store + all_token_scales[blockIdx.x] = scale; // global output store + } + __syncthreads(); + + *token_scale = s_token_scale; +} + +// hidden_size must be a multiple of 4 +template +__device__ void norm_and_quant(scalar_out_t* __restrict__ output, + scalar_t const* __restrict__ input, + scalar_t const* __restrict__ weight, + float const rms, float const scale, + int32_t const hidden_size, + scalar_t* __restrict__ residual = nullptr) { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + ; + + // Vectorized input/output/weight/residual to better utilize memory bandwidth. 
+ vec4_t const* vec_input = + reinterpret_cast const*>(&input[token_offset]); + vec4_t const* vec_weight = + reinterpret_cast const*>(weight); + q8x4_t* vec_output = + reinterpret_cast*>(&output[token_offset]); + vec4_t* vec_residual = nullptr; + if constexpr (has_residual) { + vec_residual = reinterpret_cast*>(&residual[token_offset]); + } + + int32_t const num_vec_elems = hidden_size >> 2; + +// TODO(luka/varun) extract into type-agnostic vectorized quant function to +// replace scaled_fp8_conversion_vec +#pragma unroll 4 + for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + vec4_t const in = vec_input[i]; + vec4_t const w = vec_weight[i]; + + vec4_t x; + x.x = static_cast(in.x); + x.y = static_cast(in.y); + x.z = static_cast(in.z); + x.w = static_cast(in.w); + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; + x.x += static_cast(r.x); + x.y += static_cast(r.y); + x.z += static_cast(r.z); + x.w += static_cast(r.w); + // Update residual + r.x = static_cast(x.x); + r.y = static_cast(x.y); + r.z = static_cast(x.z); + r.w = static_cast(x.w); + vec_residual[i] = r; + } + + q8x4_t out; + out.x = ScaledQuant::quant_fn( + static_cast(x.x * rms) * w.x, scale); + out.y = ScaledQuant::quant_fn( + static_cast(x.y * rms) * w.y, scale); + out.z = ScaledQuant::quant_fn( + static_cast(x.z * rms) * w.z, scale); + out.w = ScaledQuant::quant_fn( + static_cast(x.w * rms) * w.w, scale); + vec_output[i] = out; + } +} + +} // namespace vectorized + +} // namespace vllm diff --git a/csrc/quantization/fused_kernels/quant_conversions.cuh b/csrc/quantization/fused_kernels/quant_conversions.cuh new file mode 100644 index 0000000000000..f8a9872226a3a --- /dev/null +++ b/csrc/quantization/fused_kernels/quant_conversions.cuh @@ -0,0 +1,81 @@ +#pragma once + +/** + * __device__ helper functions to deal with float -> quant datatype conversion + */ + +#include "quantization/vectorization.cuh" +// TODO(luka/varun):refactor common.cuh to use this file instead +#include "quantization/fp8/common.cuh" + +namespace vllm { + +// TODO(luka/varun): combine into common utilities for int8 +// (with int8_quant_kernels.cu) +static __device__ __forceinline__ int8_t float_to_int8_rn(float const x) { +#ifdef USE_ROCM + static const float i8_min = + static_cast(std::numeric_limits::min()); + static const float i8_max = + static_cast(std::numeric_limits::max()); + // round + float dst = std::nearbyint(x); + // saturate + dst = std::clamp(dst, i8_min, i8_max); + return static_cast(dst); +#else + // CUDA path + uint32_t dst; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x)); + return reinterpret_cast(dst); +#endif +} + +static __device__ __forceinline__ FP8_TYPE float_to_fp8(float const x) { + float const r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); + return static_cast(r); +} + +template +struct ScaledQuant; + +template +struct ScaledQuant< + quant_type_t, is_scale_inverted, + typename std::enable_if_t>> { + static __device__ __forceinline__ quant_type_t quant_fn(float const x, + float const scale) { + if constexpr (is_scale_inverted) { + return float_to_int8_rn(x * scale); + } else { + return float_to_int8_rn(x / scale); + } + } +}; + +template +struct ScaledQuant< + quant_type_t, is_scale_inverted, + typename std::enable_if_t>> { + static __device__ __forceinline__ quant_type_t quant_fn(float const x, + float const scale) { + if constexpr (is_scale_inverted) { + return float_to_fp8(x * scale); + } else { + return float_to_fp8(x / scale); + } + } +}; + +template +__device__ void 
scaled_quant_conversion(quant_type_t* __restrict__ output, + scalar_t const* __restrict__ input, + float const scale, int const tid, + int const num_elements, + int const step) { + for (int i = tid; i < num_elements; i += step) { + output[i] = ScaledQuant(input[i], scale); + } +} + +} // namespace vllm diff --git a/csrc/quantization/vectorization.cuh b/csrc/quantization/vectorization.cuh new file mode 100644 index 0000000000000..44c999130f756 --- /dev/null +++ b/csrc/quantization/vectorization.cuh @@ -0,0 +1,33 @@ +#pragma once +/** + * __device__ datatypes vectorized by 4 + */ + +// Include both AMD and NVIDIA fp8 types to avoid circular import +// TODO(luka/varun) use FP8_TYPE instead after refactoring +#include +#include + +namespace vllm { + +// Vectorization containers +template +struct __align__(8) vec4_t { + scalar_t x; + scalar_t y; + scalar_t z; + scalar_t w; +}; + +template +struct __align__(4) q8x4_t { + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v); + quant_type_t x; + quant_type_t y; + quant_type_t z; + quant_type_t w; +}; + +} // namespace vllm diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 4e64b9c92773a..1ffab14862fed 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -128,6 +128,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("fused_add_rms_norm_static_fp8_quant", torch::kCUDA, &fused_add_rms_norm_static_fp8_quant); + // Fused Layernorm + Quant kernels + ops.def( + "rms_norm_dynamic_per_token_quant(Tensor! result, Tensor input, " + "Tensor weight, Tensor! scale, float epsilon, " + "Tensor? scale_ub, Tensor!? residual) -> ()"); + ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA, + &rms_norm_dynamic_per_token_quant); + // Rotary embedding // Apply GPT-NeoX or GPT-J style rotary embedding to query and key. ops.def( diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 5036189077be2..ea3aaee9565ec 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -4,10 +4,10 @@ import vllm.envs as envs from vllm import LLM, SamplingParams from vllm.compilation.fix_functionalization import FixFunctionalizationPass -from vllm.compilation.fusion import (FusionPass, find_auto_fn, - find_auto_fn_maybe) +from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey, + kFp8DynamicTokenSym, kFp8StaticTensorSym) +from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.reshapes import RedundantReshapesPass -from vllm.compilation.vllm_inductor_pass import is_func from vllm.config import CompilationConfig from .backend import TestBackend @@ -35,12 +35,16 @@ ] -@pytest.mark.parametrize("model", - ["nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"]) +@pytest.mark.parametrize( + "model, quant_key", + [("nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e", kFp8StaticTensorSym), + ("nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8_DYNAMIC-e2e", + kFp8DynamicTokenSym)]) @pytest.mark.parametrize("do_fusion", [True, False]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", reason="Only test on CUDA") -def test_fix_functionalization(model: str, do_fusion: bool): +def test_fix_functionalization(model: str, quant_key: QuantKey, + do_fusion: bool): torch.set_default_device("cuda") config = CompilationConfig.PassConfig(enable_fusion=do_fusion, @@ -78,8 +82,9 @@ def test_fix_functionalization(model: str, do_fusion: bool): # OPS_IN_MODEL always appear. 
RMS_OP is fused away if we run fusion, # and replaced by fused quantized ops in RMS_QUANT_OPS. - ops = OPS_IN_MODEL + (RMS_QUANT_OPS["static_fp8"] - if do_fusion else [RMS_OP]) + rms_ops = [FUSED_OPS[(quant_key, True)], FUSED_OPS[(quant_key, False)] + ] if do_fusion else [RMS_OP] + ops = OPS_IN_MODEL + rms_ops for op in ops: find_auto_fn(backend_no_func.graph_post_pass.nodes, op) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index f92ec8d0de5f1..b4266a4a7db94 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -3,8 +3,9 @@ from compressed_tensors.quantization import FP8_DTYPE import vllm.envs as envs -from vllm.compilation.fusion import (FusionPass, find_auto_fn, - find_auto_fn_maybe) +from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey, + FusionPass, QuantKey) +from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe from vllm.compilation.reshapes import RedundantReshapesPass from vllm.config import CompilationConfig from vllm.model_executor.layers.layernorm import RMSNorm @@ -16,24 +17,37 @@ class TestModel(torch.nn.Module): - def __init__(self, hidden_size: int, eps: float, *args, **kwargs): + def __init__(self, hidden_size: int, eps: float, static: bool, *args, + **kwargs): super().__init__(*args, **kwargs) self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] - self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(4)] + self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] + if static: + self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] + else: + self.scale = [None for _ in range(2)] self.w = [ torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() for _ in range(2) ] def forward(self, x): - resid = torch.relu(x) + resid = torch.sqrt(x) y = self.norm[0](x) - x2 = apply_fp8_linear(y, self.w[0], self.scale[0], self.scale[1]) + x2 = apply_fp8_linear(y, + self.w[0], + self.wscale[0], + self.scale[0], + use_per_token_if_dynamic=True) # make sure resid is used for replacement to work y2, resid = self.norm[1](x2, resid) - x3 = apply_fp8_linear(y2, self.w[1], self.scale[2], self.scale[3]) + x3 = apply_fp8_linear(y2, + self.w[1], + self.wscale[1], + self.scale[1], + use_per_token_if_dynamic=True) y3, resid = self.norm[2](x3, resid) # use resid here return y3 @@ -42,14 +56,13 @@ def forward(self, x): @pytest.mark.parametrize("hidden_size", [64, 3392, 4096]) @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) +@pytest.mark.parametrize("static", [True, False]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", reason="Only test on CUDA") -def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps): +def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static): torch.set_default_device("cuda") - torch.set_default_dtype(torch.float16) - - if eps != 1e-5: - pytest.skip("Only test eps=1e-5 for now") + torch.set_default_dtype(dtype) + torch.manual_seed(1) # Reshape pass is needed for the fusion pass to work config = CompilationConfig.PassConfig(enable_fusion=True, @@ -58,7 +71,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps): fusion_pass = FusionPass.instance(config) backend = TestBackend(reshape_pass, fusion_pass) - model = TestModel(hidden_size, eps) + model = TestModel(hidden_size, eps, static) # First dimension dynamic x = torch.rand(num_tokens, hidden_size) @@ -69,16 +82,28 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps): 
model2 = torch.compile(model, backend=backend) result2 = model2(x) - # Check that it gives the same answer - torch.testing.assert_close(result, result2, atol=1e-3, rtol=1e-3) + # Higher tol for dynamic, even higher for bfloat16 + if static: + ATOL, RTOL = (1e-3, 1e-3) + elif dtype == torch.float16: + ATOL, RTOL = (2e-3, 2e-3) + else: + ATOL, RTOL = (1e-2, 1e-2) + + torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL) # Check substitution worked pre_nodes = backend.graph_pre_pass.nodes post_nodes = backend.graph_post_pass.nodes - rms_quant = torch.ops._C.rms_norm_static_fp8_quant.default - add_rms_quant = torch.ops._C.fused_add_rms_norm_static_fp8_quant.default - fp8_quant = torch.ops._C.static_scaled_fp8_quant.default + # static is per-tensor, dynamic is per-token + key = QuantKey(dtype=FP8_DTYPE, + static=static, + per_tensor=static, + symmetric=True) + rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)] + add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)] + fp8_quant = QUANT_OPS[key] # In pre-nodes, fp8 quant should be present and fused kernels should not assert find_auto_fn_maybe(pre_nodes, rms_quant) is None diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/test_fused_quant_layernorm.py new file mode 100644 index 0000000000000..baf8d73fdbffb --- /dev/null +++ b/tests/kernels/test_fused_quant_layernorm.py @@ -0,0 +1,171 @@ +from typing import Optional, Tuple, Union + +import pytest +import torch + +import vllm._custom_ops as ops +from tests.kernels.utils import opcheck +from vllm.model_executor.layers.layernorm import RMSNorm + +DTYPES = [torch.bfloat16, torch.float] +QUANT_DTYPES = [torch.int8, torch.float8_e4m3fn] +VEC_HIDDEN_SIZES = range(1024, 1030) +# Avoid combinatorial explosion with full Cartesian product +NUM_TOKENS_HIDDEN_SIZES = [ + *[(1, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5120, 5137]], + *[(83, i) for i in [1, 1033, 2048, 5120]], + *[(2048, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5137]], + *[(4096, i) for i in [1, 64, 5137]], +] + +ADD_RESIDUAL = [False, True] +SCALE_UBS = [True, False] +SEEDS = [0] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + +EPS = 1e-6 + +## Helpers + + +def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: + return torch.as_tensor(x, dtype=torch.float32, device='cuda') + + +def ref_rms_norm(rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: Optional[torch.Tensor]) \ + -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if residual is not None: + residual = residual.clone() + out, residual = rms_norm_layer.forward_native(x, residual) + else: + out = rms_norm_layer.forward_native(x) + + return out, residual + + +def ref_dynamic_per_token_quant(rms_norm_layer: RMSNorm, + x: torch.Tensor, + quant_dtype: torch.dtype, + residual: Optional[torch.Tensor], + scale_ub: Optional[torch.Tensor]) \ + -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + if scale_ub is not None: + assert quant_dtype == torch.float8_e4m3fn + + # Norm + torch_out, residual = ref_rms_norm(rms_norm_layer, x, residual) + + # Quant + if quant_dtype == torch.float8_e4m3fn: + torch_out, scales = ops.scaled_fp8_quant(torch_out, + scale_ub=scale_ub, + use_per_token_if_dynamic=True) + else: + assert quant_dtype == torch.int8 + torch_out, scales = ops.scaled_int8_quant(torch_out) + + return torch_out, scales, residual + + +def ref_impl(rms_norm_layer: RMSNorm, + x: torch.Tensor, + quant_dtype: torch.dtype, + residual: Optional[torch.Tensor], + scale_ub: Optional[torch.Tensor]) 
\ + -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + return ref_dynamic_per_token_quant(rms_norm_layer, x, quant_dtype, + residual, scale_ub) + + +def ops_dynamic_per_token_quant(weight: torch.Tensor, + x: torch.Tensor, + quant_dtype: torch.dtype, + residual: Optional[torch.Tensor], + scale_ub: Optional[torch.Tensor]) \ + -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + if residual is not None: + residual = residual.clone() + out, scales = ops.rms_norm_dynamic_per_token_quant(x, weight, EPS, + quant_dtype, scale_ub, + residual) + return out, scales, residual + + +def ops_impl(weight: torch.Tensor, + x: torch.Tensor, + quant_dtype: torch.dtype, + residual: Optional[torch.Tensor], + scale_ub: Optional[torch.Tensor]) \ + -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, + scale_ub) + + +@pytest.mark.parametrize("num_tokens, hidden_size", NUM_TOKENS_HIDDEN_SIZES) +@pytest.mark.parametrize("add_residual", ADD_RESIDUAL) +@pytest.mark.parametrize("scale_ub", SCALE_UBS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("quant_dtype", QUANT_DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_rms_norm( + num_tokens: int, + hidden_size: int, + add_residual: bool, + scale_ub: bool, + dtype: torch.dtype, + quant_dtype: torch.dtype, + seed: int, + device: str, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + + if scale_ub is not None and quant_dtype != torch.float8_e4m3fn: + # skip + return + + layer = RMSNorm(hidden_size, EPS).to(dtype=dtype) + + # Make weights + layer.weight.data.normal_(mean=1.0, std=0.1) + + # Make inputs + scale = 1 / (hidden_size) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale + residual = torch.randn_like(x) * scale if add_residual else None + if scale_ub is not None: + rms_x, _ = ref_rms_norm(layer, x, residual) + scale_ub = torch.mean(rms_x).to(dtype=torch.float32, device='cuda') + + ref_out, ref_scales, ref_residual = \ + ref_impl(layer, x, quant_dtype, residual, scale_ub) + ops_out, ops_scales, ops_residual = \ + ops_impl(layer.weight, x, quant_dtype, residual, scale_ub) + + assert ref_out.dtype == quant_dtype + assert ops_out.dtype == quant_dtype + assert torch.allclose(ref_scales, ops_scales) + if quant_dtype == torch.int8: + # big atol to account for round-off errors. 
+ assert torch.allclose(ref_out, ops_out, atol=1) + else: + assert torch.allclose(ref_out.to(dtype=torch.float32), + ops_out.to(dtype=torch.float32)) + if add_residual: + assert torch.allclose(ref_residual, ops_residual) + + output = torch.empty_like(x, dtype=quant_dtype) + scales = torch.empty((x.numel() // x.shape[-1], 1), + device=x.device, + dtype=torch.float32) + + opcheck(torch.ops._C.rms_norm_dynamic_per_token_quant, + (output, x, layer.weight, scales, 1e-5, scale_ub, residual)) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index c192c9a7b0e4d..d6002630ee02c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -249,6 +249,26 @@ def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int, block_table_bound) +# fused quant layer norm ops +def rms_norm_dynamic_per_token_quant( + input: torch.Tensor, + weight: torch.Tensor, + epsilon: float, + quant_dtype: torch.dtype, + scale_ub: Optional[torch.Tensor] = None, + residual: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, torch.Tensor]: + output = torch.empty_like(input, dtype=quant_dtype) + scales = torch.empty((input.numel() // input.shape[-1], 1), + device=input.device, + dtype=torch.float32) + + torch.ops._C.rms_norm_dynamic_per_token_quant(output, input, weight, + scales, epsilon, scale_ub, + residual) + return output, scales + + # quantization ops # awq def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor, diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 3584cc3608caf..e15d7b315c50f 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -6,7 +6,8 @@ from vllm.logger import init_logger -from .vllm_inductor_pass import VllmInductorPass, is_func +from .fx_utils import is_func +from .vllm_inductor_pass import VllmInductorPass logger = init_logger(__name__) @@ -53,14 +54,16 @@ def __call__(self, graph: torch.fx.Graph): self.insert_defunctionalized(graph, node) self._remove(node) - # These 2 replacements avoid the most copies for LLaMa. + # rms_norm replacements avoid the most copies for LLaMa. 
elif at_target == torch.ops._C.fused_add_rms_norm.default: mutated_args = {1: 'input', 2: 'residual'} self.defunctionalize(graph, node, mutated_args) elif at_target == torch.ops._C.fused_add_rms_norm_static_fp8_quant.default: # noqa: E501 mutated_args = {1: 'result', 2: 'residual'} self.defunctionalize(graph, node, mutated_args) - + elif at_target == torch.ops._C.rms_norm_dynamic_per_token_quant.default: # noqa: E501 + mutated_args = {1: 'result', 2: 'scale', 3: 'residual'} + self.defunctionalize(graph, node, mutated_args) elif at_target in [ torch.ops._C.rms_norm.default, torch.ops._C.rms_norm_static_fp8_quant.default diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 5efa410fab6a0..cde27bd108212 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,129 +1,517 @@ -import operator -from typing import Iterable, List, Optional +from typing import Callable, Dict, List, NamedTuple, Optional, Tuple import torch +import torch._inductor.pattern_matcher as pm +# TODO(luka) use vllm.utils once #10836 landed +from compressed_tensors.quantization import FP8_DTYPE +from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized -from torch._inductor.pattern_matcher import (Match, PatternMatcherPass, - fwd_only, register_replacement) +from torch._inductor.pattern_matcher import PatternMatcherPass +from torch._ops import OpOverload from vllm.config import CompilationConfig from vllm.logger import init_logger -from .vllm_inductor_pass import VllmInductorPass, is_func +from .fx_utils import find_getitem_maybe +from .multi_output_match import MultiOutputMatch +from .vllm_inductor_pass import VllmInductorPass logger = init_logger(__name__) -def rms_pattern_static(result: torch.Tensor, result_rms: torch.Tensor, - input: torch.Tensor, weight: torch.Tensor, - scale: torch.Tensor): - at1 = auto_functionalized(torch.ops._C.rms_norm.default, - result=result_rms, - input=input, - weight=weight, - epsilon=1e-5) - at2 = auto_functionalized(torch.ops._C.static_scaled_fp8_quant.default, - result=result, - input=at1[1], - scale=scale) - - # result - return at2[1] - - -def rms_replacement_static(result: torch.Tensor, result_rms: torch.Tensor, - input: torch.Tensor, weight: torch.Tensor, - scale: torch.Tensor): - at = auto_functionalized(torch.ops._C.rms_norm_static_fp8_quant.default, - result=result, - input=input, - weight=weight, - scale=scale, - epsilon=1e-5) - - # result - return at[1] - - -def rms_pattern_residual_static(result: torch.Tensor, input: torch.Tensor, - residual: torch.Tensor, weight: torch.Tensor, - scale: torch.Tensor): - at = auto_functionalized(torch.ops._C.fused_add_rms_norm.default, - input=input, - residual=residual, - weight=weight, - epsilon=1e-5) - at1 = auto_functionalized(torch.ops._C.static_scaled_fp8_quant.default, - result=result, - input=at[1], - scale=scale) - - # result, residual - return at1[1], at[2] - - -def rms_replacement_residual_static(result: torch.Tensor, input: torch.Tensor, - residual: torch.Tensor, - weight: torch.Tensor, scale: torch.Tensor): - at = auto_functionalized( - torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, - result=result, - input=input, - residual=residual, - weight=weight, - scale=scale, - epsilon=1e-5) - # result, residual - return at[1], at[2] - - def empty_bf16(*args, **kwargs): return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda") -def empty_fp8(*args, **kwargs): - fp8 = torch.float8_e4m3fn - return torch.empty(*args, **kwargs, dtype=fp8, 
device="cuda") - - def empty_fp32(*args, **kwargs): return torch.empty(*args, **kwargs, dtype=torch.float32, device="cuda") -# Utilities for post-processing multi-output matches +RMS_OP = torch.ops._C.rms_norm.default +RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default + + +class QuantKey(NamedTuple): + """ + Named tuple for identifying the type of quantization. + dtype: quantized data type + static: static quantization if True, dynamic if False + per_tensor: per-tensor quantization if True, per-token if False + symmetric: symmetric if True, asymmetric if False + """ + dtype: torch.dtype + static: bool + per_tensor: bool = True + symmetric: bool = True + + def __str__(self): + return (f"QuantKey({'static' if self.static else 'dynamic'}," + f"{fx.graph.dtype_abbrs[self.dtype]}," + f"{'per_tensor' if self.per_tensor else 'per_token'}," + f"{'a' if not self.symmetric else ''}symmetric)") + + +kFp8StaticTensorSym = QuantKey(FP8_DTYPE, True, True, True) +kFp8DynamicTensorSym = QuantKey(FP8_DTYPE, False, True, True) +kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, False, False, True) + +QUANT_OPS: Dict[QuantKey, OpOverload] = { + kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa + kFp8DynamicTensorSym: + torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa + kFp8DynamicTokenSym: + torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa +} + + +class FusedRMSQuantKey(NamedTuple): + """ + Named tuple for identifying the type of RMSNorm + quant fusion. + quant: type of quantization + fused_add: does the op also perform the residual add + """ + quant: QuantKey + fused_add: bool + + def __str__(self): + return (f"FusedQuantKey({self.quant}, with" + f"{'' if self.fused_add else 'out'} residual)") + + +FUSED_OPS: Dict[FusedRMSQuantKey, OpOverload] = { + FusedRMSQuantKey(kFp8StaticTensorSym, False): + torch.ops._C.rms_norm_static_fp8_quant.default, # noqa + FusedRMSQuantKey(kFp8StaticTensorSym, True): + torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, # noqa + FusedRMSQuantKey(kFp8DynamicTokenSym, False): + torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa + FusedRMSQuantKey(kFp8DynamicTokenSym, True): + torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa +} + + +class QuantMultiOutputMatch(MultiOutputMatch): + + def __init__(self, match: pm.Match, quant_op, fused_op): + super().__init__(match) + assert isinstance(quant_op, OpOverload) + assert isinstance(fused_op, OpOverload) + self.QUANT_OP = quant_op # in-place quant op + self.FUSED_OP = fused_op # in-place fused quant op + + def insert_fused_node(self, fused_return_mapping: Dict[int, Tuple[fx.Node, + int]], + **kwargs): + """ + This utility function inserts an auto-functionalized node for FUSED_OP. + It also correctly sets its meta value and rebinds the users of the + unfused nodes to use the fused node instead. + + :param fused_return_mapping: A dictionary, mapping from getitem indices + of the fused node result to a tuple of the old node and a getitem index. + :param kwargs: kwargs that get directly forwarded to the auto_fn node + + Example: + If we want to replace this graph: + _, x1, x2 = auto_fn(op1) + _, y1, y2 = auto_fn(op2) + + with + _, x1, y2, x2 = auto_fn(FUSED_OP) + + we would call: + insert_fused_node({1: (op1_node, 1), 2: (op2_node, 2), 3: (op1_node, 2)} + + Note that the 0th element is None for auto-functionalized in-place ops. + Hence, others appear 1-indexed. 
+ """ + fused_node = self.insert_auto_fn(self.FUSED_OP, kwargs) + indices = fused_return_mapping.keys() + getitem_nodes = self.insert_getitems(fused_node, indices) + + # Prepare the meta value, use a list so it's mutable + meta_val = [None] * (max(indices) + 1) + + # Iterate through elements of the tuple produced by fused_node + for idx, getitem_node in zip(indices, getitem_nodes): + old_node, old_idx = fused_return_mapping[idx] + + # If the old value was never used, the old_getitem might not exist + old_getitem = find_getitem_maybe(old_node, old_idx) + if old_getitem is not None: + # Rebind the users of match getitem nodes to use the new nodes. + # The old nodes will be removed by DCE at the end of the pass. + old_getitem.replace_all_uses_with(getitem_node) + getitem_node.meta["val"] = old_getitem.meta["val"] + + # Extract the appropriate meta value + # It is present even if the getitem node does not exist + meta_val[idx] = old_node.meta["val"][old_idx] + + # Fix the meta value on the new fused node + fused_node.meta["val"] = tuple(meta_val) + + +class RMSNormQuantPattern: + + def __init__(self, epsilon: float, key: FusedRMSQuantKey): + self.epsilon = epsilon + self.quant_dtype = key.quant.dtype + + assert key.quant in QUANT_OPS, \ + f"unsupported quantization scheme {key.quant}" + self.QUANT_OP = QUANT_OPS[key.quant] + + assert key in FUSED_OPS, \ + f"unsupported fused rmsnorm+quant op for {key}" + self.FUSED_OP = FUSED_OPS[key] + + +class RMSNormStaticQuantPattern(RMSNormQuantPattern): + + def __init__(self, + epsilon: float, + quant_dtype: torch.dtype, + symmetric=True): + fused_key = FusedRMSQuantKey(fused_add=False, + quant=QuantKey(dtype=quant_dtype, + static=True, + per_tensor=True, + symmetric=symmetric)) + super().__init__(epsilon, fused_key) + + def register(self, pm_pass: PatternMatcherPass): + # Cannot use methods, as the self argument affects tracing + def pattern(result: torch.Tensor, result_rms: torch.Tensor, + input: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at1 = auto_functionalized(RMS_OP, + result=result_rms, + input=input, + weight=weight, + epsilon=self.epsilon) + at2 = auto_functionalized(self.QUANT_OP, + result=result, + input=at1[1], + scale=scale) + + # result + return at2[1] + + def replacement(result: torch.Tensor, result_rms: torch.Tensor, + input: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(self.FUSED_OP, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=self.epsilon) + + # result + return at[1] + + inputs = [ + torch.empty(5, 4, device="cuda", dtype=self.quant_dtype), # result + empty_bf16(5, 4), # result_rms + empty_bf16(5, 4), # input + empty_bf16(1, 5), # weight + empty_fp32(1, 1) # scale + ] + + pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, + pm_pass) + + +class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern): + + def __init__(self, + epsilon: float, + quant_dtype: torch.dtype, + symmetric=True): + key = FusedRMSQuantKey(fused_add=True, + quant=QuantKey(dtype=quant_dtype, + static=True, + per_tensor=True, + symmetric=symmetric)) + super().__init__(epsilon, key) + + def register(self, pm_pass: PatternMatcherPass, + record_match: Callable[[MultiOutputMatch], bool]): + + def pattern(result: torch.Tensor, input: torch.Tensor, + residual: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(RMS_ADD_OP, + input=input, + residual=residual, + weight=weight, + epsilon=self.epsilon) + at1 = 
auto_functionalized(self.QUANT_OP, + result=result, + input=at[1], + scale=scale) + + # result, residual + return at1[1], at[2] + + def replacement(result: torch.Tensor, input: torch.Tensor, + residual: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(self.FUSED_OP, + result=result, + input=input, + residual=residual, + weight=weight, + scale=scale, + epsilon=self.epsilon) + + # result, residual + return at[1], at[2] + + inputs = [ + torch.empty(5, 4, device="cuda", dtype=self.quant_dtype), # result + empty_bf16(5, 4), # input + empty_bf16(5, 4), # residual + empty_bf16(1, 5), # weight + empty_fp32(1, 1) # scale + ] + + pm.register_replacement( + pattern, + replacement, + inputs, + pm.fwd_only, + pm_pass, + extra_check=lambda m: record_match( + self.Match(m, self.QUANT_OP, self.FUSED_OP))) + + class Match(QuantMultiOutputMatch): + + def process(self): + # Find the nodes in the match that we need to rebind + rms_node = self.find_auto_fn(RMS_ADD_OP) + quant_node = self.find_auto_fn(self.QUANT_OP) + + assert len(rms_node.users) == 2 + assert len(quant_node.users) == 1 + + # First, insert a new auto_functionalized node for the fused op, + # as well as getitem nodes to extract the result and residual. + # The auto_fn node returns a tuple of (None, result, residual). + # + # The resulting graph looks like this: + # at = auto_functionalized(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, ...) # noqa + # result_node_new = at[1] + # residual_node_new = at[2] + with self.inserting_after_match(): + # Missing epsilon, scalars cannot be inputs to the pattern + kwargs = self.match.kwargs.copy() + + # 0 is always None + fused_return_mapping = {1: (quant_node, 1), 2: (rms_node, 2)} + self.insert_fused_node(fused_return_mapping, + epsilon=rms_node.kwargs["epsilon"], + **kwargs) + + +class RMSNormDynamicQuantPattern(RMSNormQuantPattern): + + def __init__(self, + epsilon: float, + quant_dtype: torch.dtype, + per_tensor: bool, + symmetric=True): + key = FusedRMSQuantKey(fused_add=False, + quant=QuantKey(dtype=quant_dtype, + static=False, + per_tensor=per_tensor, + symmetric=symmetric)) + super().__init__(epsilon, key) + + def register(self, pm_pass: PatternMatcherPass, + record_match: Callable[[MultiOutputMatch], bool]): + + def pattern(result: torch.Tensor, result_rms: torch.Tensor, + input: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at1 = auto_functionalized(RMS_OP, + result=result_rms, + input=input, + weight=weight, + epsilon=self.epsilon) + at2 = auto_functionalized(self.QUANT_OP, + result=result, + input=at1[1], + scale=scale, + scale_ub=None) + + # result, scale + return at2[1], at2[2] + + def replacement(result: torch.Tensor, result_rms: torch.Tensor, + input: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(self.FUSED_OP, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=self.epsilon, + scale_ub=None, + residual=None) + + # result, scale + return at[1], at[2] + + inputs = [ + torch.empty(5, 4, device="cuda", dtype=self.quant_dtype), # result + empty_bf16(5, 4), # result_rms + empty_bf16(5, 4), # input + empty_bf16(1, 5), # weight + empty_fp32(1, 1) # scale + ] + + pm.register_replacement( + pattern, + replacement, + inputs, + pm.fwd_only, + pm_pass, + extra_check=lambda m: record_match( + self.Match(m, self.QUANT_OP, self.FUSED_OP))) + class Match(QuantMultiOutputMatch): -# Returns the first auto_functionalized node with the given op (if it exists) -def 
find_auto_fn_maybe(nodes: Iterable[torch.fx.Node], - op) -> Optional[torch.fx.Node]: - for node in nodes: - if is_func(node, auto_functionalized) and node.args[0] == op: # noqa - return node - return None + def process(self): + # Find the nodes in the match that we need to rebind + rms_node = self.find_auto_fn(RMS_OP) + quant_node = self.find_auto_fn(self.QUANT_OP) + assert len(rms_node.users) == 1 + assert len(quant_node.users) == 2 -# Returns the first auto_functionalized node with the given op -def find_auto_fn(nodes: Iterable[torch.fx.Node], op) -> torch.fx.Node: - node = find_auto_fn_maybe(nodes, op) - assert node is not None, f"Could not find {op} in nodes {nodes}" - return node + # First, insert a new auto_functionalized node for the fused op, + # as well as getitem nodes to extract the result and scale. + # The auto_fn node returns a tuple of (None, result, scale). + # + # The resulting graph looks like this: + # at = auto_functionalized(torch.ops._C.rms_norm_dynamic_per_token_quant.default, ...) # noqa + # result_node_new = at[1] + # scale_node_new = at[2] + with self.inserting_after_match(): + # Missing epsilon, scalars cannot be inputs to the pattern + kwargs = self.match.kwargs.copy() + del kwargs["result_rms"] # not used in the fused op + + fused_return_mapping = {1: (quant_node, 1), 2: (quant_node, 2)} + self.insert_fused_node( + fused_return_mapping, + epsilon=rms_node.kwargs["epsilon"], + scale_ub=None, # not used but required + residual=None, # not used but required + **kwargs) + + +class FusedAddRMSNormDynamicQuantPattern(RMSNormQuantPattern): + + def __init__(self, + epsilon: float, + quant_dtype: torch.dtype, + per_tensor: bool = True, + symmetric=True): + key = FusedRMSQuantKey(fused_add=True, + quant=QuantKey(dtype=quant_dtype, + static=False, + per_tensor=per_tensor, + symmetric=symmetric)) + super().__init__(epsilon, key) + + def register(self, pm_pass: PatternMatcherPass, + record_match: Callable[[MultiOutputMatch], bool]): + + def pattern(result: torch.Tensor, input: torch.Tensor, + residual: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(RMS_ADD_OP, + input=input, + residual=residual, + weight=weight, + epsilon=self.epsilon) + at1 = auto_functionalized(self.QUANT_OP, + result=result, + input=at[1], + scale=scale, + scale_ub=None) + + # result, residual, scale + return at1[1], at[2], at1[2] + + def replacement(result: torch.Tensor, input: torch.Tensor, + residual: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(self.FUSED_OP, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=self.epsilon, + scale_ub=None, + residual=residual) + + # result, residual, scale + return at[1], at[3], at[2] + inputs = [ + torch.empty(5, 4, device="cuda", dtype=self.quant_dtype), # result + empty_bf16(5, 4), # input + empty_bf16(5, 4), # residual + empty_bf16(1, 5), # weight + empty_fp32(1, 1) # scale + ] -# Returns the getitem node that extracts the idx-th element from node -# (if it exists) -def find_getitem_maybe(node: torch.fx.Node, - idx: int) -> Optional[torch.fx.Node]: - for user in node.users: - if is_func(user, operator.getitem) and user.args[1] == idx: - return user - return None + pm.register_replacement( + pattern, + replacement, + inputs, + pm.fwd_only, + pm_pass, + extra_check=lambda m: record_match( + self.Match(m, self.QUANT_OP, self.FUSED_OP))) + class Match(QuantMultiOutputMatch): -# Returns the getitem node that extracts the idx-th element from node -def 
find_getitem(node: torch.fx.Node, idx: int) -> torch.fx.Node: - ret = find_getitem_maybe(node, idx) - assert ret is not None, f"Could not find getitem {idx} in node {node}" - return ret + def process(self): + # Find the nodes in the match that we need to rebind + rms_node = self.find_auto_fn(RMS_ADD_OP) + quant_node = self.find_auto_fn(self.QUANT_OP) + + assert len(rms_node.users) == 2 + assert len(quant_node.users) == 2 + + # First, insert a new auto_functionalized node for the fused op, + # as well as getitem nodes to extract result, scale, and residual. + # The auto_fn node returns a tuple (None, result, scale, residual). + # + # The resulting graph looks like this: + # at = auto_functionalized(torch.ops._C.rms_norm_dynamic_per_token_quant.default, ...) # noqa + # result_node_new = at[1] + # scale_node_new = at[2] + # residual_node_new = at[3] + with self.inserting_after_match(): + # Missing epsilon, scalars cannot be inputs to the pattern + kwargs = self.match.kwargs.copy() + + fused_return_mapping = { + 1: (quant_node, 1), # result + 2: (quant_node, 2), # scale + 3: (rms_node, 2), # residual + } + self.insert_fused_node( + fused_return_mapping, + epsilon=rms_node.kwargs["epsilon"], + scale_ub=None, # not used but required + **kwargs) class FusionPass(VllmInductorPass): @@ -158,41 +546,39 @@ def __init__(self, config: CompilationConfig.PassConfig): "FusionPass singleton instance already exists" super().__init__(config) - self.matches: List[Match] = [] + self.matches: List[MultiOutputMatch] = [] self.patterns: PatternMatcherPass = PatternMatcherPass( pass_name="fusion_pass") - # Fuse rms_norm + static_scaled_fp8_quant into - # rms_norm_static_fp8_quant - inputs = [ - empty_fp8(5, 4), - empty_bf16(5, 4), - empty_bf16(5, 4), - empty_bf16(1, 5), - empty_fp32(1, 1) - ] - register_replacement(rms_pattern_static, rms_replacement_static, - inputs, fwd_only, self.patterns) + for epsilon in [1e-5, 1e-6]: + # Fuse rms_norm + static fp8 quant + RMSNormStaticQuantPattern(epsilon, + FP8_DTYPE).register(self.patterns) - # Fuse fused_add_rms_norm + static_scaled_fp8_quant into - # fused_add_rms_norm_static_fp8_quant - # Because pattern has 2 outputs, we need to manually process the match - # (see process_matches) - inputs = [ - empty_fp8(5, 4), - empty_bf16(5, 4), - empty_bf16(5, 4), - empty_bf16(1, 5), - empty_fp32(1, 1) - ] - register_replacement(rms_pattern_residual_static, - rms_replacement_residual_static, - inputs, - fwd_only, - self.patterns, - extra_check=lambda m: self.record_match(m)) - - def record_match(self, match: Match) -> bool: + # Matches for patterns below have 2 or more outputs, + # so we need to process them manually (see process_matches) + + # Fuse rms_norm + static fp8 quant + FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register( + self.patterns, self.record_match) + + # Fuse rms_norm + dynamic per-token fp8 quant + RMSNormDynamicQuantPattern(epsilon, FP8_DTYPE, + per_tensor=False).register( + self.patterns, self.record_match) + + # Fuse fused_add_rms_norm + dynamic per-token fp8 quant + FusedAddRMSNormDynamicQuantPattern(epsilon, + FP8_DTYPE, + per_tensor=False).register( + self.patterns, + self.record_match) + + # WARNING: This is a hack to clear the pattern matcher cache + # and allow multiple values of epsilon. + torch._inductor.pattern_matcher._seen_patterns.clear() + + def record_match(self, match: MultiOutputMatch) -> bool: # Hijack the extra_check to record the match and # save it for post-processing. 
self.matches.append(match) @@ -200,83 +586,20 @@ def record_match(self, match: Match) -> bool: # Return False to prevent automatic replacement. return False - def process_matches(self, graph: torch.fx.Graph): + def process_matches(self, graph: fx.Graph): """ Manually process multi-output matches and replace them with fused nodes. - This is necessary because the automatic replacement for multi-output - matches is broken: https://github.com/pytorch/pytorch/issues/137280 + See MultiOutputMatch for more details. """ for match in self.matches: - # To avoid use-before-definition errors, insert replacement nodes - # after the last node in the match. - # match.nodes is not guaranteed to be sorted. - # Find the last node in the match. - for last_node_in_match in reversed(graph.nodes): - if last_node_in_match in match.nodes: - break - else: - raise ValueError("No nodes in graph") - - # Insert a new auto_functionalized node for the fused operation, - # as well as getitem nodes to extract the result and residual. - # The auto_functionalized node returns a tuple of - # (None, result, residual) - None is the function return value. - # The resulting graph looks like this: - # at = auto_functionalized(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, ...) # noqa - # result_node_new = at[1] - # residual_node_new = at[2] - with graph.inserting_after(last_node_in_match): - kwargs = match.kwargs - kwargs["epsilon"] = 1e-5 # Currently hard-coded in RMSNorm - - fused_node = graph.call_function( - auto_functionalized, - (torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, - ), - kwargs=kwargs) - - graph.inserting_after(fused_node) - result_node_new = graph.call_function(operator.getitem, - (fused_node, 1)) - residual_node_new = graph.call_function( - operator.getitem, (fused_node, 2)) - - # Last part of replacement is rebinding the users of nodes in the - # match to use the new nodes. - - # Find the nodes in the match that we need to rebind - rms_node = find_auto_fn(match.nodes, - torch.ops._C.fused_add_rms_norm.default) - quant_node = find_auto_fn( - match.nodes, torch.ops._C.static_scaled_fp8_quant.default) - - assert len(rms_node.users) == 2 - assert len(quant_node.users) == 1 - - # meta["val"] is used by de-functionalization and has to contain the - # value of the node (tuple of tensors) that would be returned by the - # functionalized node during tracing. - - rms_tup = rms_node.meta["val"] - quant_tup = quant_node.meta["val"] - - # The result of fused_node must be a tuple with the first element - # None (the function return value) and the remaining elements - # representing the mutated inputs. - fused_tup = (None, quant_tup[1], rms_tup[1], rms_tup[2]) - fused_node.meta["val"] = fused_tup - - # Find the getitem nodes and replace their uses with the new nodes. - # The old nodes will be removed by DCE at the end of the pass. 
- find_getitem(rms_node, 2).replace_all_uses_with(residual_node_new) - find_getitem(quant_node, 1).replace_all_uses_with(result_node_new) + match.process() # Finally, remove matched nodes graph.eliminate_dead_code() assert all(node not in graph.nodes for match in self.matches - for node in match.nodes) + for node in match.match.nodes) - def __call__(self, graph: torch.fx.Graph): + def __call__(self, graph: fx.Graph): self.begin() self.dump_graph(graph, "before_fusion") diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py new file mode 100644 index 0000000000000..924e26f2e262e --- /dev/null +++ b/vllm/compilation/fx_utils.py @@ -0,0 +1,42 @@ +import operator +from typing import Iterable, Optional + +from torch import fx +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._ops import OpOverload + + +def is_func(node: fx.Node, target) -> bool: + return node.op == "call_function" and node.target == target + + +# Returns the first auto_functionalized node with the given op (if it exists) +def find_auto_fn_maybe(nodes: Iterable[fx.Node], + op: OpOverload) -> Optional[fx.Node]: + for node in nodes: + if is_func(node, auto_functionalized) and node.args[0] == op: # noqa + return node + return None + + +# Returns the first auto_functionalized node with the given op +def find_auto_fn(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node: + node = find_auto_fn_maybe(nodes, op) + assert node is not None, f"Could not find {op} in nodes {nodes}" + return node + + +# Returns the getitem node that extracts the idx-th element from node +# (if it exists) +def find_getitem_maybe(node: fx.Node, idx: int) -> Optional[fx.Node]: + for user in node.users: + if is_func(user, operator.getitem) and user.args[1] == idx: + return user + return None + + +# Returns the getitem node that extracts the idx-th element from node +def find_getitem(node: fx.Node, idx: int) -> fx.Node: + ret = find_getitem_maybe(node, idx) + assert ret is not None, f"Could not find getitem {idx} in node {node}" + return ret diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py new file mode 100644 index 0000000000000..0ad648abfbb3a --- /dev/null +++ b/vllm/compilation/multi_output_match.py @@ -0,0 +1,105 @@ +import abc +import operator +from abc import abstractmethod +from typing import Iterable, List, Tuple + +from torch import fx +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._inductor import pattern_matcher as pm +from torch._ops import OpOverload + +from vllm.compilation.fx_utils import find_auto_fn + + +class MultiOutputMatch(abc.ABC): + """ + This class provides utilities to process multi-output matches and + manually insert replacements. + + This is necessary because the automatic replacement for multi-output + matches is broken: https://github.com/pytorch/pytorch/issues/137280 + """ + + def __init__(self, match: pm.Match): + self.match = match + + @abstractmethod + def process(self): + """ + Process a multi-output match and manually insert the replacement. + + This method should: + 1. Insert the replacement nodes after the last node in the match. + 2. Rebind the users of nodes in the match to use the new nodes. + 3. Set meta["val"] for de-functionalization. + + The result of an auto-functionalized node is a tuple of tensors. + The first element is the return value of the function, usually None. + The remaining elements are the mutated args of the function. 
+ + All auto-functionalized nodes must contain a proper meta["val"], + as it is used by de-functionalization. meta["val"] has to contain the + value of the node (tuple of tensors) that would be returned by the + functionalized node during tracing. + + Existing nodes in the graph all have this property set, but we have + to set it manually for new nodes we insert. + + Example: + # op schema: foo(a: Tensor!, b: Tensor, c: Tensor!) -> None + at = auto_functionalized(torch.ops._C.foo.default, a, b, c) + # at.meta["val"] = (None, a, c) + """ + raise NotImplementedError + + @property + def nodes(self) -> List[fx.Node]: + return self.match.nodes + + @property + def graph(self) -> fx.Graph: + return self.match.graph + + def find_auto_fn(self, op) -> fx.Node: + """ + Find the first auto_functionalized node with the given op in the match. + """ + return find_auto_fn(self.nodes, op) + + def inserting_after_match(self): + """ + Insert nodes after the last node in the match. + This is done to avoid use-before-definition errors after inserting + replacement nodes. + """ + + # match.nodes is not guaranteed to be sorted. + # Find the last node in the match. + for last_node_in_match in reversed(self.graph.nodes): + if last_node_in_match in self.match.nodes: + break + else: + raise ValueError("No nodes in graph") + + return self.graph.inserting_after(last_node_in_match) + + def insert_getitems(self, tuple_node: fx.Node, + indices: Iterable[int]) -> Tuple[fx.Node, ...]: + """ + Insert operator.getitem nodes to extract elements from a tuple node. + + :param tuple_node: The tuple node to extract elements from. + :param indices: The indices of the elements to extract. + :return: Tuple of the new getitem nodes, corresponding to the indices. + """ + with self.graph.inserting_after(tuple_node): + return tuple( + self.graph.call_function(operator.getitem, (tuple_node, idx)) + for idx in indices) + + def insert_auto_fn(self, op: OpOverload, kwargs): + """ + Insert an auto_functionalized node with the given op and kwargs. + """ + return self.graph.call_function(auto_functionalized, (op, ), + kwargs=kwargs) diff --git a/vllm/compilation/reshapes.py b/vllm/compilation/reshapes.py index 63a369fe8d966..ba28b1f0be7bd 100644 --- a/vllm/compilation/reshapes.py +++ b/vllm/compilation/reshapes.py @@ -5,7 +5,8 @@ from vllm.logger import init_logger -from .vllm_inductor_pass import VllmInductorPass, is_func +from .fx_utils import is_func +from .vllm_inductor_pass import VllmInductorPass logger = init_logger(__name__) diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index dbf6b8f7789e1..b8c52a7f46838 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -16,10 +16,6 @@ logger = init_logger(__name__) -def is_func(node: torch.fx.Node, target) -> bool: - return node.op == "call_function" and node.target == target - - class VllmInductorPass(InductorPass): """ An inductor pass with access to vLLM PassConfig. 
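
The multi-output patterns introduced in the patch above cannot be replaced automatically, so the pass records each match and rewrites the graph by hand: insert the fused auto_functionalized node, add operator.getitem nodes, rebind the users of the old getitem nodes, and let dead-code elimination remove the leftovers. The snippet below is a minimal, standalone torch.fx sketch of that rewrite mechanism only; it is not vLLM code, it assumes a recent PyTorch, and the helper `two_outputs` and the `Model` module are invented purely for illustration.

import operator

import torch
from torch import fx


@fx.wrap  # keep the helper as a single call_function node when traced
def two_outputs(x: torch.Tensor):
    return x + 1, x * 2


class Model(torch.nn.Module):

    def forward(self, x):
        return two_outputs(x)[0]


gm = fx.symbolic_trace(Model())

# Find the tuple-producing node and its existing getitem user (index 0),
# analogous to the find_auto_fn / find_getitem helpers above.
tuple_node = next(n for n in gm.graph.nodes
                  if n.op == "call_function" and n.target == two_outputs)
old_getitem = next(u for u in tuple_node.users
                   if u.op == "call_function"
                   and u.target == operator.getitem and u.args[1] == 0)

# Insert a new getitem (index 1) right after the tuple node, rebind every
# user of the old getitem to it, and let DCE drop the now-unused node.
with gm.graph.inserting_after(tuple_node):
    new_getitem = gm.graph.call_function(operator.getitem, (tuple_node, 1))
old_getitem.replace_all_uses_with(new_getitem)
gm.graph.eliminate_dead_code()
gm.recompile()

x = torch.tensor([3.0])
print(Model()(x))  # tensor([4.]) -- original graph computes x + 1
print(gm(x))       # tensor([6.]) -- rewritten graph computes x * 2

Deferring this manual rewrite to process_matches, after pattern matching has run, is what works around the broken automatic multi-output replacement tracked in pytorch/pytorch#137280.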
From 1efce686053c15cd6f84361bb0bd1898fbb23a82 Mon Sep 17 00:00:00 2001 From: Pooya Davoodi Date: Thu, 12 Dec 2024 20:09:53 -0800 Subject: [PATCH 03/56] [Bugfix] Use runner_type instead of task in GritLM (#11144) Signed-off-by: Pooya Davoodi --- tests/models/embedding/language/test_gritlm.py | 6 +++--- vllm/model_executor/models/gritlm.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py index b947265be9e9d..55c2e5d4ed412 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/embedding/language/test_gritlm.py @@ -35,7 +35,7 @@ def test_find_array(monkeypatch): from vllm.model_executor.models.gritlm import GritLMPooler # Create an LLM object to get the model config. - llm = vllm.LLM(MODEL_NAME, task="embedding", max_model_len=MAX_MODEL_LEN) + llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) pooler = GritLMPooler(model_config=llm.llm_engine.model_config) arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) @@ -55,7 +55,7 @@ def server_embedding(): with pytest.MonkeyPatch.context() as mp: mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") - args = ["--task", "embedding", "--max_model_len", str(MAX_MODEL_LEN)] + args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @@ -141,7 +141,7 @@ def test_gritlm_offline_embedding(monkeypatch): queries, q_instruction, documents, d_instruction = get_test_data() - llm = vllm.LLM(MODEL_NAME, task="embedding", max_model_len=MAX_MODEL_LEN) + llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) d_rep = run_llm_encode( llm, diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index ec01a07c16a62..34c1332ac4a66 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -203,12 +203,12 @@ def __init__( ) -> None: super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) - self.task = vllm_config.model_config.task + self.runner_type = vllm_config.model_config.runner_type self._pooler = GritLMPooler(vllm_config.model_config) for layer in self.model.layers: - if self.task == "embedding" and hasattr(layer, "self_attn"): + if self.runner_type == "pooling" and hasattr(layer, "self_attn"): assert isinstance(layer.self_attn.attn.impl, XFormersImpl), ( "GritLM embedding is only supported by XFormers backend, " "which can be forced by VLLM_ATTENTION_BACKEND=XFORMERS") @@ -222,8 +222,8 @@ def forward( **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: - # Change attention to non-causal for embedding task. - if self.task == "embedding": + # Change attention to non-causal for pooling tasks. 
+ if self.runner_type == "pooling": assert attn_metadata.prefill_metadata.attn_bias is None attn_metadata.prefill_metadata.attn_bias = [ BlockDiagonalMask.from_seqlens(attn_metadata.seq_lens) From 3989a798249bfa24b6dd22aff599796fcf92dce9 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 13 Dec 2024 00:07:20 -0500 Subject: [PATCH 04/56] [Bugfix] Update starcoder2 to remap k/v scale names for kv_cache quantization (#11148) --- vllm/model_executor/models/starcoder2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 15e8f2af52cda..22189a517d313 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -37,7 +37,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -345,6 +346,10 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight, shard_id) break else: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if self.config.tie_word_embeddings and "lm_head.weight" in name: continue if is_pp_missing_parameter(name, self): From 00c1bde5d8cd30b14f661b11d9ad1c1d4470ddbf Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 13 Dec 2024 00:31:26 -0500 Subject: [PATCH 05/56] [ROCm][AMD] Disable auto enabling chunked prefill on ROCm (#11146) Signed-off-by: Gregory Shtrasberg --- vllm/engine/arg_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0c28fe7032728..0098648b1cd60 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1074,7 +1074,8 @@ def create_engine_config(self, if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and model_config.runner_type != "pooling"): + and model_config.runner_type != "pooling" + and not current_platform.is_rocm()): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " From 34f1a806d5771c4ee81fdaf4feb7f9fd4071d779 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 12 Dec 2024 22:30:06 -0800 Subject: [PATCH 06/56] [Bugfix][V1] Fix 'NoneType' object has no attribute 'hash_value' (#11157) Signed-off-by: Cody Yu --- vllm/v1/core/kv_cache_manager.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 03cbb958237df..8044481a9cd6a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -164,13 +164,14 @@ def append_slots( new_full_blocks = req_blocks[ num_computed_full_blocks:num_full_blocks_after_append] - self._cache_full_blocks( - request=request, - blk_start_idx=num_computed_full_blocks, - full_blocks=new_full_blocks, - prev_block=req_blocks[num_computed_full_blocks - 1] - if num_computed_full_blocks >= 1 else None, - ) + if new_full_blocks: + 
self._cache_full_blocks( + request=request, + blk_start_idx=num_computed_full_blocks, + full_blocks=new_full_blocks, + prev_block=req_blocks[num_computed_full_blocks - 1] + if num_computed_full_blocks >= 1 else None, + ) return new_blocks @@ -375,8 +376,13 @@ def _cache_full_blocks( prev_block: The previous block in the chain. """ # Update the new blocks with the block hashes through the chain. - prev_block_hash_value = (prev_block.block_hash.hash_value - if prev_block is not None else None) + prev_block_hash_value = None + if prev_block is not None: + # Previous block must have a block hash because it must be + # a full, cached block. + assert prev_block.block_hash is not None + prev_block_hash_value = prev_block.block_hash.hash_value + for i, blk in enumerate(full_blocks): blk_idx = blk_start_idx + i From be39e3cd18781c4571410323f3c767e67240eb51 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 12 Dec 2024 22:57:50 -0800 Subject: [PATCH 07/56] [core] clean up cudagraph batchsize padding logic (#10996) Signed-off-by: youkaichao --- .../decoder_only/language/test_jamba.py | 5 +- .../decoder_only/language/test_mamba.py | 5 +- .../test_encoder_decoder_model_runner.py | 4 +- tests/worker/test_model_runner.py | 4 +- vllm/config.py | 171 +++++++++++------- vllm/model_executor/models/jamba.py | 20 +- vllm/model_executor/models/mamba.py | 21 ++- vllm/v1/worker/gpu_model_runner.py | 11 +- vllm/worker/enc_dec_model_runner.py | 2 +- vllm/worker/model_runner.py | 7 +- vllm/worker/xpu_model_runner.py | 4 - 11 files changed, 150 insertions(+), 104 deletions(-) diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index cae25ae9fa2c8..057b04349e8b7 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -1,7 +1,7 @@ import pytest from tests.utils import multi_gpu_test -from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs from vllm.sampling_params import SamplingParams from ...utils import check_outputs_equal @@ -189,7 +189,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch size. If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - while len(example_prompts) == VllmConfig.get_graph_batch_size( + vllm_config = EngineArgs(model=model).create_engine_config() + while len(example_prompts) == vllm_config.pad_for_cudagraph( len(example_prompts)): example_prompts.append(example_prompts[0]) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 35018c3c14dee..06739e8f02253 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -5,7 +5,7 @@ import pytest from transformers import AutoModelForCausalLM, AutoTokenizer -from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs from vllm.sampling_params import SamplingParams from ...utils import check_outputs_equal @@ -200,7 +200,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch size. 
If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - while len(example_prompts) == VllmConfig.get_graph_batch_size( + vllm_config = EngineArgs(model=model).create_engine_config() + while len(example_prompts) == vllm_config.pad_for_cudagraph( len(example_prompts)): example_prompts.append(example_prompts[0]) diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 5289c91f201cd..a6b3cb5759f2b 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -4,7 +4,6 @@ import pytest import torch -from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata @@ -548,7 +547,8 @@ def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): # With CUDA Graph capture and replay enabled, the decoder and encoder # input sequences will be padded. Create the expected padded tensors # accordingly. - graph_batch_size = VllmConfig.get_graph_batch_size(expanded_batch_size) + graph_batch_size = model_runner.vllm_config.pad_for_cudagraph( + expanded_batch_size) cuda_graph_pad_size = graph_batch_size - expanded_batch_size padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size)) padded_encoder_seq_lens = encoder_seq_lens + list( diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 4055524f3e0c7..aabe913c242e1 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -3,7 +3,6 @@ import pytest import torch -from vllm.config import VllmConfig from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.engine.arg_utils import EngineArgs @@ -177,7 +176,8 @@ def test_prepare_decode_cuda_graph(batch_size): model_input.attn_metadata, model_input.attn_metadata.slot_mapping) assert len(slot_mapping) == len(input_tokens) - expected_bs = VllmConfig.get_graph_batch_size(len(seq_group_metadata_list)) + expected_bs = model_runner.vllm_config.pad_for_cudagraph( + len(seq_group_metadata_list)) # Verify input metadata is correct for prompts. device = model_runner.device assert attn_metadata.num_prefills == 0 diff --git a/vllm/config.py b/vllm/config.py index 08a7b607630af..12ed80c366e43 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2354,6 +2354,12 @@ def model_post_init(self, __context: Any) -> None: # not configurable, computed after init compile_sizes: List[int] = PrivateAttr capture_sizes: List[int] = PrivateAttr + max_capture_size: int = PrivateAttr + # optimization: + # Intuitively, bs_to_padded_graph_size should be Dict[int, int]. + # since we know all keys are in a range [0, max_capture_size], + # we can optimize it to List[int] for better lookup performance. 
+ bs_to_padded_graph_size: List[int] = PrivateAttr # keep track of enabled and disabled custom ops enabled_custom_ops: Counter[str] = PrivateAttr @@ -2365,6 +2371,19 @@ def model_post_init(self, __context: Any) -> None: # Map from layer name to the attention cls static_forward_context: Dict[str, Any] = PrivateAttr + def __repr__(self) -> str: + exclude = { + "static_forward_context", + "enabled_custom_ops", + "disabled_custom_ops", + "compilation_time", + "bs_to_padded_graph_size", + "pass_config", + } + return self.model_dump_json(exclude=exclude, exclude_unset=True) + + __str__ = __repr__ + @classmethod def from_cli(cls, cli_value: str) -> "CompilationConfig": """Parse the CLI value for the compilation config.""" @@ -2450,18 +2469,22 @@ def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]): # sort to make sure cudagraph capture sizes are in descending order self.capture_sizes.sort(reverse=True) + self.max_capture_size = self.capture_sizes[ + 0] if self.capture_sizes else 0 - -_BATCH_SIZE_ALIGNMENT = 8 -# all the token sizes that **can** be captured by cudagraph. -# they can be arbitrarily large. -# currently it includes: 1, 2, 4, 8, 16, 24, 32, 40, ..., 8192. -# the actual sizes to capture will be determined by the model, -# depending on the model's max_num_seqs. -# NOTE: get_graph_batch_size needs to be updated if this list is changed. -_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ - _BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025) -] + # pre-compute the mapping from batch size to padded graph size + self.bs_to_padded_graph_size = [ + 0 for i in range(self.max_capture_size + 1) + ] + for end, start in zip(self.capture_sizes, + self.capture_sizes[1:] + [0]): + for bs in range(start, end): + if bs == start: + self.bs_to_padded_graph_size[bs] = start + else: + self.bs_to_padded_graph_size[bs] = end + self.bs_to_padded_graph_size[ + self.max_capture_size] = self.max_capture_size @dataclass @@ -2491,40 +2514,12 @@ class VllmConfig: init=True) # type: ignore instance_id: str = "" - @staticmethod - def get_graph_batch_size(batch_size: int) -> int: - """Returns the padded batch size given actual batch size. - - Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, - 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... - """ - if batch_size <= 2: - return batch_size - elif batch_size <= 4: - return 4 - else: - return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // - _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) - - @staticmethod - def get_max_graph_batch_size(max_num_seqs: int) -> int: - """ - max_num_seqs: Maximum number of sequences in a batch. - _BATCH_SIZES_TO_CAPTURE: all the sizes that we want to capture. - - pad the max_num_seqs if necessary by calling get_graph_batch_size, - which will deal with some edge cases like 1, 2, 4. - - if the padded size is in _BATCH_SIZES_TO_CAPTURE, return the padded - size. if not, it means the padded size is larger than the largest size - in _BATCH_SIZES_TO_CAPTURE, return the largest size in - _BATCH_SIZES_TO_CAPTURE. - """ - padded_size = VllmConfig.get_graph_batch_size(max_num_seqs) - if padded_size in _BATCH_SIZES_TO_CAPTURE: - return padded_size - assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] + def pad_for_cudagraph(self, batch_size: int) -> int: + # if batch_size > self.compilation_config.max_capture_size, + # it should raise an IndexError. 
+ # the caller should make sure the batch_size is within the range, + # i.e., batch_size <= self.compilation_config.max_capture_size + return self.compilation_config.bs_to_padded_graph_size[batch_size] @staticmethod def _get_quantization_config( @@ -2618,27 +2613,7 @@ def __post_init__(self): self.compilation_config.pass_config.enable_reshape = False self.compilation_config.level = CompilationLevel.PIECEWISE - if not envs.VLLM_USE_V1: - max_batchsize_to_capture = 0 - if self.scheduler_config is not None and \ - self.model_config is not None and \ - not self.model_config.enforce_eager: - max_batchsize_to_capture = \ - self.get_max_graph_batch_size( - self.scheduler_config.max_num_seqs) - batch_size_capture_list = [ - size for size in _BATCH_SIZES_TO_CAPTURE - if size <= max_batchsize_to_capture - ] - else: - batch_size_capture_list = [] - if self.model_config is not None and \ - not self.model_config.enforce_eager: - batch_size_capture_list = [1, 2, 4 - ] + [i for i in range(8, 513, 8)] - - self.compilation_config.init_with_cudagraph_sizes( - batch_size_capture_list) + self._set_cudagraph_sizes() if self.cache_config is not None and \ self.cache_config.cpu_offload_gb > 0 and \ @@ -2659,6 +2634,70 @@ def __post_init__(self): if not self.instance_id: self.instance_id = random_uuid()[:5] + def _set_cudagraph_sizes(self): + """ + cudagraph batchsize padding logic: + + `[1, 2, 4] + [8 * i for i in range(1, 1025)]` is a list of all possible + batch sizes that cudagraph will capture. + + Depending on the engine's configuration of `max_num_seqs`, the + candidate batch sizes to capture cudagraph will shrink to the subset + which just cover the range of `[1, max_num_seqs]`. In the common case, + `max_num_seqs` is 256, and the cudagraph batch sizes will be + `[1, 2, 4, 8, 16, 24, 32, 40, ..., 256]`. + + However, if users specify the cudagraph capture sizes through + compilation config, we will use the specified sizes instead. + + In the end, `vllm_config.compilation_config.capture_sizes` will be the + final sizes to capture cudagraph (in descending order). + + During runtime, if batchsize is larger than + `vllm_config.compilation_config.capture_sizes`, + no cudagraph will be used. + If the batch size is no larger than + `vllm_config.compilation_config.capture_sizes`, + we can quickly find the padded graph size for a given batch size by + looking up `vllm_config.compilation_config.bs_to_padded_graph_size`. 
+ """ + + # calculate the default `batch_size_capture_list` + if not envs.VLLM_USE_V1: + batch_size_capture_list = [] + max_batchsize_to_capture = 0 + if self.scheduler_config is not None and \ + self.model_config is not None and \ + not self.model_config.enforce_eager: + + possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)] + # find the minimum size that is larger than max_num_seqs, + # which then becomes the max_batchsize_to_capture + larger_sizes = [ + x for x in possible_sizes + if x >= self.scheduler_config.max_num_seqs + ] + if larger_sizes: + max_batchsize_to_capture = larger_sizes[0] + else: + max_batchsize_to_capture = possible_sizes[-1] + + # filter out the sizes that are + # larger than max_batchsize_to_capture + batch_size_capture_list = [ + size for size in possible_sizes + if size <= max_batchsize_to_capture + ] + else: + batch_size_capture_list = [] + if self.model_config is not None and \ + not self.model_config.enforce_eager: + batch_size_capture_list = [1, 2, 4 + ] + [i for i in range(8, 513, 8)] + + self.compilation_config.init_with_cudagraph_sizes( + batch_size_capture_list) + def __str__(self): return ( f"model={self.model_config.model!r}," diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 6bb4c13ab35df..831db2ae52d74 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -7,7 +7,7 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.layer import Attention -from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE @@ -420,6 +420,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + if self.scheduler_config is not None and \ + not self.model_config.enforce_eager: + if self.scheduler_config.max_num_seqs > \ + vllm_config.compilation_config.max_capture_size: + self.max_batch_size = \ + vllm_config.compilation_config.max_capture_size + else: + self.max_batch_size = vllm_config.pad_for_cudagraph( + self.scheduler_config.max_num_seqs) + else: + self.max_batch_size = 8192 + 2 def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -433,15 +444,12 @@ def forward(self, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: - max_batch_size = (VllmConfig.get_graph_batch_size( - self.scheduler_config.max_num_seqs) if self.scheduler_config - else max(_BATCH_SIZES_TO_CAPTURE) + 2) num_mamba_layers = self.model_config.get_num_layers_by_block_type( self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( - self.lm_head.weight.dtype, num_mamba_layers, max_batch_size, - *self._get_mamba_cache_shape()) + self.lm_head.weight.dtype, num_mamba_layers, + self.max_batch_size, *self._get_mamba_cache_shape()) ( mamba_cache_tensors, state_indices_tensor, diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 1f5cd02711899..06c8d9723cd01 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -6,7 +6,7 @@ from transformers import MambaConfig from vllm.attention.backends.abstract import AttentionMetadata -from vllm.config 
import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm @@ -195,6 +195,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.backbone.make_empty_intermediate_tensors) + if self.scheduler_config is not None and \ + not self.model_config.enforce_eager: + if self.scheduler_config.max_num_seqs > \ + vllm_config.compilation_config.max_capture_size: + self.max_batch_size = \ + vllm_config.compilation_config.max_capture_size + else: + self.max_batch_size = vllm_config.pad_for_cudagraph( + self.scheduler_config.max_num_seqs) + else: + self.max_batch_size = 8192 + 2 def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.backbone.get_input_embeddings(input_ids) @@ -208,15 +219,11 @@ def forward(self, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: - max_batch_size = (VllmConfig.get_graph_batch_size( - self.scheduler_config.max_num_seqs) if self.scheduler_config - else max(_BATCH_SIZES_TO_CAPTURE) + 2) - num_mamba_layers = self.model_config.get_num_layers_by_block_type( self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( - self.lm_head.weight.dtype, num_mamba_layers, max_batch_size, - *self._get_mamba_cache_shape()) + self.lm_head.weight.dtype, num_mamba_layers, + self.max_batch_size, *self._get_mamba_cache_shape()) ( mamba_cache_tensors, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index aa91255e68d48..f24942068d1f8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,6 @@ import gc import time -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Dict, List, Tuple import numpy as np import torch @@ -459,7 +459,7 @@ def execute_model( and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): # Use piecewise CUDA graphs. # Add padding to the batch size. - num_input_tokens = self._get_padded_batch_size( + num_input_tokens = self.vllm_config.pad_for_cudagraph( num_scheduled_tokens) else: # Eager mode. @@ -641,10 +641,3 @@ def initialize_kv_cache(self, num_blocks: int) -> None: torch.zeros(kv_cache_shape, dtype=self.kv_cache_dtype, device=self.device)) - - def _get_padded_batch_size(self, batch_size: int) -> Optional[int]: - # TODO: Optimize this? - for size in self.cudagraph_batch_sizes: - if batch_size <= size: - return size - return None diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 5697fbbaa2041..bff01320d7927 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -464,7 +464,7 @@ def _prepare_encoder_model_input_tensors( # We will be using CUDA graph replay for this decode. 
max_len_of_block_table = self.get_max_block_per_batch() batch_size = len(encoder_seq_lens) - graph_batch_size = self.vllm_config.get_graph_batch_size( + graph_batch_size = self.vllm_config.pad_for_cudagraph( batch_size) assert graph_batch_size >= batch_size cuda_graph_pad_size = graph_batch_size - batch_size diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 26fd486130ce6..6ff98a8f1bab2 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -802,7 +802,8 @@ def _get_cuda_graph_pad_size(self, max_encoder_seq_len): return -1 - graph_batch_size = VllmConfig.get_graph_batch_size(batch_size) + graph_batch_size = self.runner.vllm_config.pad_for_cudagraph( + batch_size) assert graph_batch_size >= batch_size return graph_batch_size - batch_size @@ -1014,8 +1015,8 @@ def __init__( self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture - self.max_batchsize_to_capture = VllmConfig.get_max_graph_batch_size( - self.scheduler_config.max_num_seqs) + self.max_batchsize_to_capture = \ + self.vllm_config.compilation_config.max_capture_size self.graph_runners: List[Dict[int, CUDAGraphRunner]] = [ {} for _ in range(self.parallel_config.pipeline_parallel_size) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index e6322e095bbb9..9cf25387560da 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -37,10 +37,6 @@ logger = init_logger(__name__) _PAD_SLOT_ID = -1 -_BATCH_SIZE_ALIGNMENT = 8 -_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ - _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) -] TModelInputForXPU = TypeVar('TModelInputForXPU', bound="ModelInputForXPU") From 7cd7409142ff97aee1a13568753db9263fcf8f6b Mon Sep 17 00:00:00 2001 From: Jani Monoses Date: Fri, 13 Dec 2024 09:40:07 +0200 Subject: [PATCH 08/56] PaliGemma 2 support (#11142) --- docs/source/models/supported_models.rst | 4 ++-- examples/offline_inference_vision_language.py | 13 +++++++++++++ vllm/model_executor/models/paligemma.py | 11 ++++++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 35aa3bfdd12b7..cae4a88de1638 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -664,9 +664,9 @@ Text Generation (``--task generate``) - ✅︎ - ✅︎ * - :code:`PaliGemmaForConditionalGeneration` - - PaliGemma + - PaliGemma, PaliGemma 2 - T + I\ :sup:`E` - - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc. + - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc. 
- - ✅︎ - diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 5e210126dc8fe..c430f42fdc814 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -137,6 +137,18 @@ def run_paligemma(question: str, modality: str): return llm, prompt, stop_token_ids +# PaliGemma 2 +def run_paligemma2(question: str, modality: str): + assert modality == "image" + + # PaliGemma 2 has special prompt format for VQA + prompt = "caption en" + llm = LLM(model="google/paligemma2-3b-ft-docci-448", + mm_cache_preprocessor=args.mm_cache_preprocessor) + stop_token_ids = None + return llm, prompt, stop_token_ids + + # Chameleon def run_chameleon(question: str, modality: str): assert modality == "image" @@ -473,6 +485,7 @@ def run_mantis(question: str, modality: str): "fuyu": run_fuyu, "phi3_v": run_phi3v, "paligemma": run_paligemma, + "paligemma2": run_paligemma2, "chameleon": run_chameleon, "minicpmv": run_minicpmv, "blip-2": run_blip2, diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 253e689e50a3b..f9ad0c67adaba 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -105,6 +105,11 @@ def input_processor_for_paligemma(ctx: InputContext, orig_prompt_ids.remove(hf_config.image_token_index) new_prompt = f"{image_token_str_pad}{bos_token}{orig_prompt}\n" + + # The PaliGemma 2 tokenizer does not include a starting BOS token + if orig_prompt_ids[0] != hf_config.bos_token_id: + orig_prompt_ids = [hf_config.bos_token_id] + orig_prompt_ids + new_token_ids = image_token_ids_pad + orig_prompt_ids + [108] #newline # NOTE: Create a defensive copy of the original inputs @@ -149,7 +154,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): projection_dim=config.vision_config.projection_dim) self.quant_config = quant_config - config.text_config.architectures = ["GemmaForCausalLM"] + + if config.text_config.model_type == "gemma": + config.text_config.architectures = ["GemmaForCausalLM"] + else: + config.text_config.architectures = ["Gemma2ForCausalLM"] self.language_model = init_vllm_registered_model( vllm_config=vllm_config, hf_config=config.text_config, From f93bf2b1897cca5b644fe03f31925e4faff40056 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Fri, 13 Dec 2024 16:50:35 +0800 Subject: [PATCH 09/56] [Bugfix][CI][CPU] add missing datasets package to requirements-cpu.txt (#11159) Signed-off-by: jiang1.li --- requirements-cpu.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-cpu.txt b/requirements-cpu.txt index db8ad9d3a015d..e62f313297762 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -4,4 +4,5 @@ # Dependencies for CPUs torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" torch==2.5.1; platform_machine == "aarch64" -torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch \ No newline at end of file +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +datasets # for benchmark scripts \ No newline at end of file From eeec9e339005d887e0064f7b3e7771295ecd68e7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 13 Dec 2024 18:40:07 +0800 Subject: [PATCH 10/56] [Frontend] Separate pooling APIs in offline inference (#11129) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 7 
+- docs/source/models/pooling_models.rst | 53 +++- examples/offline_inference_classification.py | 28 ++ examples/offline_inference_embedding.py | 16 +- examples/offline_inference_scoring.py | 23 ++ ...ine_inference_vision_language_embedding.py | 2 +- tests/conftest.py | 18 +- tests/entrypoints/openai/test_score.py | 10 +- .../models/embedding/language/test_scoring.py | 10 +- tests/models/test_oot_registration.py | 5 +- vllm/__init__.py | 36 +-- vllm/engine/llm_engine.py | 17 +- vllm/entrypoints/llm.py | 143 ++++++++- vllm/entrypoints/openai/protocol.py | 2 +- vllm/entrypoints/openai/serving_embedding.py | 9 +- vllm/entrypoints/openai/serving_engine.py | 12 +- vllm/entrypoints/openai/serving_score.py | 12 +- vllm/model_executor/layers/pooler.py | 288 ++++++++++++------ vllm/model_executor/models/gritlm.py | 15 +- vllm/outputs.py | 207 ++++++++----- vllm/sequence.py | 40 ++- 21 files changed, 659 insertions(+), 294 deletions(-) create mode 100644 examples/offline_inference_classification.py create mode 100644 examples/offline_inference_scoring.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6a6ee3cf713ae..97aae233db105 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -181,14 +181,14 @@ steps: commands: - VLLM_USE_V1=1 pytest -v -s v1 -- label: Examples Test # 15min +- label: Examples Test # 25min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] source_file_dependencies: - vllm/entrypoints - examples/ commands: - - pip install awscli tensorizer # for llava example and tensorizer test + - pip install tensorizer # for tensorizer test - python3 offline_inference.py - python3 cpu_offload.py - python3 offline_inference_chat.py @@ -198,6 +198,9 @@ steps: - python3 offline_inference_vision_language_multi_image.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py + - python3 offline_inference_classification.py + - python3 offline_inference_embedding.py + - python3 offline_inference_scoring.py - python3 offline_profile.py --model facebook/opt-125m - label: Prefix Caching Test # 9min diff --git a/docs/source/models/pooling_models.rst b/docs/source/models/pooling_models.rst index 7fa66274c3c5a..94475c5e6689d 100644 --- a/docs/source/models/pooling_models.rst +++ b/docs/source/models/pooling_models.rst @@ -6,7 +6,7 @@ Pooling Models vLLM also supports pooling models, including embedding, reranking and reward models. In vLLM, pooling models implement the :class:`~vllm.model_executor.models.VllmModelForPooling` interface. -These models use a :class:`~vllm.model_executor.layers.Pooler` to aggregate the final hidden states of the input +These models use a :class:`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input before returning them. .. note:: @@ -45,20 +45,48 @@ which takes priority over both the model's and Sentence Transformers's defaults. ^^^^^^^^^^^^^^ The :class:`~vllm.LLM.encode` method is available to all pooling models in vLLM. -It returns the aggregated hidden states directly. +It returns the extracted hidden states directly, which is useful for reward models. + +.. 
code-block:: python + + llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") + output, = llm.encode("Hello, my name is") + + data = output.outputs.data + print(f"Prompt: {prompt!r} | Data: {data!r}") + +``LLM.embed`` +^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.embed` method outputs an embedding vector for each prompt. +It is primarily designed for embedding models. .. code-block:: python llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") - outputs = llm.encode("Hello, my name is") + output, = llm.embed("Hello, my name is") - outputs = model.encode(prompts) - for output in outputs: - embeddings = output.outputs.embedding - print(f"Prompt: {prompt!r}, Embeddings (size={len(embeddings)}: {embeddings!r}") + embeds = output.outputs.embedding + print(f"Embeddings: {embeds!r} (size={len(embeds)})") A code example can be found in `examples/offline_inference_embedding.py `_. +``LLM.classify`` +^^^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM.classify` method outputs a probability vector for each prompt. +It is primarily designed for classification models. + +.. code-block:: python + + llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") + output, = llm.classify("Hello, my name is") + + probs = output.outputs.probs + print(f"Class Probabilities: {probs!r} (size={len(probs)})") + +A code example can be found in `examples/offline_inference_classification.py `_. + ``LLM.score`` ^^^^^^^^^^^^^ @@ -71,7 +99,16 @@ These types of models serve as rerankers between candidate query-document pairs vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. To handle RAG at a higher level, you should use integration frameworks such as `LangChain `_. -You can use `these tests `_ as reference. +.. code-block:: python + + llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") + output, = llm.score("What is the capital of France?", + "The capital of Brazil is Brasilia.") + + score = output.outputs.score + print(f"Score: {score}") + +A code example can be found in `examples/offline_inference_scoring.py `_. Online Inference ---------------- diff --git a/examples/offline_inference_classification.py b/examples/offline_inference_classification.py new file mode 100644 index 0000000000000..de539b639a196 --- /dev/null +++ b/examples/offline_inference_classification.py @@ -0,0 +1,28 @@ +from vllm import LLM + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create an LLM. +# You should pass task="classify" for classification models +model = LLM( + model="jason9693/Qwen2.5-1.5B-apeach", + task="classify", + enforce_eager=True, +) + +# Generate logits. The output is a list of ClassificationRequestOutputs. +outputs = model.classify(prompts) + +# Print the outputs. +for prompt, output in zip(prompts, outputs): + probs = output.outputs.probs + probs_trimmed = ((str(probs[:16])[:-1] + + ", ...]") if len(probs) > 16 else probs) + print(f"Prompt: {prompt!r} | " + f"Class Probabilities: {probs_trimmed} (size={len(probs)})") diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference_embedding.py index 17f6d992073d7..58d004313ad51 100644 --- a/examples/offline_inference_embedding.py +++ b/examples/offline_inference_embedding.py @@ -9,14 +9,20 @@ ] # Create an LLM. 
+# You should pass task="embed" for embedding models model = LLM( model="intfloat/e5-mistral-7b-instruct", - task="embed", # You should pass task="embed" for embedding models + task="embed", enforce_eager=True, ) -# Generate embedding. The output is a list of PoolingRequestOutputs. -outputs = model.encode(prompts) +# Generate embedding. The output is a list of EmbeddingRequestOutputs. +outputs = model.embed(prompts) + # Print the outputs. -for output in outputs: - print(output.outputs.embedding) # list of 4096 floats +for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + embeds_trimmed = ((str(embeds[:16])[:-1] + + ", ...]") if len(embeds) > 16 else embeds) + print(f"Prompt: {prompt!r} | " + f"Embeddings: {embeds_trimmed} (size={len(embeds)})") diff --git a/examples/offline_inference_scoring.py b/examples/offline_inference_scoring.py new file mode 100644 index 0000000000000..5da9e710959b5 --- /dev/null +++ b/examples/offline_inference_scoring.py @@ -0,0 +1,23 @@ +from vllm import LLM + +# Sample prompts. +text_1 = "What is the capital of France?" +texts_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." +] + +# Create an LLM. +# You should pass task="score" for cross-encoder models +model = LLM( + model="BAAI/bge-reranker-v2-m3", + task="score", + enforce_eager=True, +) + +# Generate scores. The output is a list of ScoringRequestOutputs. +outputs = model.score(text_1, texts_2) + +# Print the outputs. +for text_2, output in zip(texts_2, outputs): + score = output.outputs.score + print(f"Pair: {[text_1, text_2]!r} | Score: {score}") diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index bf466109f0981..4ce3d496bf45b 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -133,7 +133,7 @@ def run_encode(model: str, modality: QueryModality): if req_data.image is not None: mm_data["image"] = req_data.image - outputs = req_data.llm.encode({ + outputs = req_data.llm.embed({ "prompt": req_data.prompt, "multi_modal_data": mm_data, }) diff --git a/tests/conftest.py b/tests/conftest.py index 7606e0f11dfeb..4e939221329cd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -719,14 +719,6 @@ def get_inputs( return inputs - def classify(self, prompts: List[str]) -> List[str]: - req_outputs = self.model.encode(prompts) - outputs = [] - for req_output in req_outputs: - embedding = req_output.outputs.embedding - outputs.append(embedding) - return outputs - def generate( self, prompts: List[str], @@ -897,6 +889,10 @@ def generate_beam_search( returned_outputs.append((token_ids, texts)) return returned_outputs + def classify(self, prompts: List[str]) -> List[List[float]]: + req_outputs = self.model.classify(prompts) + return [req_output.outputs.probs for req_output in req_outputs] + def encode( self, prompts: List[str], @@ -909,16 +905,16 @@ def encode( videos=videos, audios=audios) - req_outputs = self.model.encode(inputs) + req_outputs = self.model.embed(inputs) return [req_output.outputs.embedding for req_output in req_outputs] def score( self, text_1: Union[str, List[str]], text_2: Union[str, List[str]], - ) -> List[List[float]]: + ) -> List[float]: req_outputs = self.model.score(text_1, text_2) - return [req_output.outputs.embedding for req_output in req_outputs] + return [req_output.outputs.score for req_output in req_outputs] def __enter__(self): return self diff --git 
a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 7565ff7192f67..0698c19ad0023 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -39,8 +39,8 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, assert score.id is not None assert score.data is not None assert len(score.data) == 2 - assert score.data[0].score[0] <= 0.01 - assert score.data[1].score[0] >= 0.9 + assert score.data[0].score <= 0.01 + assert score.data[1].score >= 0.9 @pytest.mark.asyncio @@ -67,8 +67,8 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, assert score.id is not None assert score.data is not None assert len(score.data) == 2 - assert score.data[0].score[0] <= 0.01 - assert score.data[1].score[0] >= 0.9 + assert score.data[0].score <= 0.01 + assert score.data[1].score >= 0.9 @pytest.mark.asyncio @@ -90,4 +90,4 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, assert score.id is not None assert score.data is not None assert len(score.data) == 1 - assert score.data[0].score[0] >= 0.9 + assert score.data[0].score >= 0.9 diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index 0c3115d195fc1..af31e1a635f65 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -42,7 +42,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): assert len(vllm_outputs) == 1 assert len(hf_outputs) == 1 - assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01) + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) @pytest.mark.parametrize("dtype", ["half"]) @@ -63,8 +63,8 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1][0], rel_tol=0.01) + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) @pytest.mark.parametrize("dtype", ["half"]) @@ -85,5 +85,5 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1][0], rel_tol=0.01) + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 94be215258f89..2c413a633896a 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -2,7 +2,7 @@ import pytest -from vllm import LLM, PoolingParams, SamplingParams +from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from ..utils import fork_new_process_for_each_test @@ -36,9 +36,8 @@ def test_oot_registration_text_generation(dummy_opt_path): def test_oot_registration_embedding(dummy_gemma2_embedding_path): os.environ["VLLM_PLUGINS"] = "register_dummy_model" prompts = ["Hello, my name is", "The text does not matter"] - sampling_params = PoolingParams() llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") - outputs = llm.encode(prompts, sampling_params) + outputs = llm.embed(prompts) for output in outputs: assert all(v == 0 for v in 
output.outputs.embedding) diff --git a/vllm/__init__.py b/vllm/__init__.py index a10f6d3128cb6..45252b93e3d54 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -7,8 +7,11 @@ from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType, TextPrompt, TokensPrompt from vllm.model_executor.models import ModelRegistry -from vllm.outputs import (CompletionOutput, PoolingOutput, - PoolingRequestOutput, RequestOutput) +from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput, + CompletionOutput, EmbeddingOutput, + EmbeddingRequestOutput, PoolingOutput, + PoolingRequestOutput, RequestOutput, ScoringOutput, + ScoringRequestOutput) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -27,6 +30,12 @@ "CompletionOutput", "PoolingOutput", "PoolingRequestOutput", + "EmbeddingOutput", + "EmbeddingRequestOutput", + "ClassificationOutput", + "ClassificationRequestOutput", + "ScoringOutput", + "ScoringRequestOutput", "LLMEngine", "EngineArgs", "AsyncLLMEngine", @@ -34,26 +43,3 @@ "initialize_ray_cluster", "PoolingParams", ] - - -def __getattr__(name: str): - import warnings - - if name == "EmbeddingOutput": - msg = ("EmbeddingOutput has been renamed to PoolingOutput. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return PoolingOutput - - if name == "EmbeddingRequestOutput": - msg = ("EmbeddingRequestOutput has been renamed to " - "PoolingRequestOutput. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return PoolingRequestOutput - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d756f71e4fa53..dc2d77d6927cd 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -46,11 +46,10 @@ from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest, - ParallelSampleSequenceGroup, Sequence, - SequenceGroup, SequenceGroupBase, - SequenceGroupMetadata, SequenceGroupOutput, - SequenceStatus) +from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup, + PoolingSequenceGroupOutput, Sequence, SequenceGroup, + SequenceGroupBase, SequenceGroupMetadata, + SequenceGroupOutput, SequenceStatus) from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, init_tracer) from vllm.transformers_utils.config import try_get_generation_config @@ -966,9 +965,9 @@ def has_unfinished_requests_for_virtual_engine( @staticmethod def _process_sequence_group_outputs( seq_group: SequenceGroup, - outputs: List[EmbeddingSequenceGroupOutput], + outputs: List[PoolingSequenceGroupOutput], ) -> None: - seq_group.embeddings = outputs[0].embeddings + seq_group.pooled_data = outputs[0].data for seq in seq_group.get_seqs(): seq.status = SequenceStatus.FINISHED_STOPPED @@ -1784,8 +1783,8 @@ def _get_stats(self, num_prompt_tokens_iter) # Spec decode, if enabled, emits specialized metrics from the worker in # sampler output. 
- if model_output and (model_output[0].spec_decode_worker_metrics - is not None): + if model_output and isinstance(model_output[0], SamplerOutput) and ( + model_output[0].spec_decode_worker_metrics is not None): spec_decode_metrics = model_output[0].spec_decode_worker_metrics else: spec_decode_metrics = None diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0bec978c4869c..11b2574ce42dd 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -26,7 +26,9 @@ from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding.guided_fields import ( GuidedDecodingRequest, LLMGuidedOptions) -from vllm.outputs import PoolingRequestOutput, RequestOutput +from vllm.outputs import (ClassificationRequestOutput, EmbeddingRequestOutput, + PoolingRequestOutput, RequestOutput, + ScoringRequestOutput) from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, @@ -120,7 +122,7 @@ class LLM: serving, use the :class:`~vllm.AsyncLLMEngine` class instead. """ - DEPRECATE_LEGACY: ClassVar[bool] = False + DEPRECATE_LEGACY: ClassVar[bool] = True """A flag to toggle whether to deprecate the legacy generate/encode API.""" DEPRECATE_INIT_POSARGS: ClassVar[bool] = True @@ -257,11 +259,14 @@ def generate( self, prompts: Union[PromptType, Sequence[PromptType]], /, - *, sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None, + *, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + guided_options_request: Optional[Union[LLMGuidedOptions, + GuidedDecodingRequest]] = None, ) -> List[RequestOutput]: ... @@ -275,6 +280,9 @@ def generate( prompt_token_ids: Optional[List[int]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + guided_options_request: Optional[Union[LLMGuidedOptions, + GuidedDecodingRequest]] = None, ) -> List[RequestOutput]: ... @@ -288,6 +296,9 @@ def generate( prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + guided_options_request: Optional[Union[LLMGuidedOptions, + GuidedDecodingRequest]] = None, ) -> List[RequestOutput]: ... @@ -302,6 +313,9 @@ def generate( prompt_token_ids: List[int], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + guided_options_request: Optional[Union[LLMGuidedOptions, + GuidedDecodingRequest]] = None, ) -> List[RequestOutput]: ... @@ -316,6 +330,9 @@ def generate( prompt_token_ids: List[List[int]], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + guided_options_request: Optional[Union[LLMGuidedOptions, + GuidedDecodingRequest]] = None, ) -> List[RequestOutput]: ... 
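# Illustrative usage sketch for the widened ``generate`` overloads above
# (assumed call sites, not taken from the diff itself): sampling params may
# now be passed positionally, while the newly listed parameters such as
# prompt_adapter_request stay keyword-only. The model name and prompts below
# are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.0, max_tokens=16)

# Positional sampling_params; prompt_adapter_request / guided_options_request
# would be supplied as keyword arguments when needed.
outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)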
@@ -328,6 +345,9 @@ def generate( prompt_token_ids: Union[List[int], List[List[int]]], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + guided_options_request: Optional[Union[LLMGuidedOptions, + GuidedDecodingRequest]] = None, ) -> List[RequestOutput]: ... @@ -678,11 +698,12 @@ def encode( self, prompts: Union[PromptType, Sequence[PromptType]], /, - *, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, + *, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> List[PoolingRequestOutput]: ... @@ -696,6 +717,7 @@ def encode( prompt_token_ids: Optional[List[int]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> List[PoolingRequestOutput]: ... @@ -709,6 +731,7 @@ def encode( prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> List[PoolingRequestOutput]: ... @@ -723,6 +746,7 @@ def encode( prompt_token_ids: List[int], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> List[PoolingRequestOutput]: ... @@ -737,6 +761,7 @@ def encode( prompt_token_ids: List[List[int]], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> List[PoolingRequestOutput]: ... @@ -749,6 +774,7 @@ def encode( prompt_token_ids: Union[List[int], List[List[int]]], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> List[PoolingRequestOutput]: ... @@ -768,7 +794,8 @@ def encode( lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> List[PoolingRequestOutput]: - """Generates the completions for the input prompts. + """Apply pooling to the hidden states corresponding to the input + prompts. This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts @@ -787,7 +814,7 @@ def encode( Returns: A list of ``PoolingRequestOutput`` objects containing the - generated embeddings in the same order as the input prompts. + pooled hidden states in the same order as the input prompts. Note: Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is @@ -833,28 +860,110 @@ def encode( return self.engine_class.validate_outputs(outputs, PoolingRequestOutput) + def embed( + self, + prompts: Union[PromptType, Sequence[PromptType]], + /, + *, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[EmbeddingRequestOutput]: + """ + Generate an embedding vector for each prompt. + + This class automatically batches the given prompts, considering + the memory constraint. For the best performance, put all of your prompts + into a single list and pass it to this method. + + Args: + prompts: The prompts to the LLM. 
You may pass a sequence of prompts
+                for batch inference. See :class:`~vllm.inputs.PromptType`
+                for more details about the format of each prompt.
+            use_tqdm: Whether to use tqdm to display the progress bar.
+            lora_request: LoRA request to use for generation, if any.
+            prompt_adapter_request: Prompt Adapter request to use for
+                generation, if any.
+
+        Returns:
+            A list of ``EmbeddingRequestOutput`` objects containing the
+            embedding vectors in the same order as the input prompts.
+        """
+        if self.llm_engine.model_config.task != "embed":
+            raise ValueError(
+                "Embedding API is only enabled for `--task embed`")
+
+        items = self.encode(prompts,
+                            use_tqdm=use_tqdm,
+                            lora_request=lora_request,
+                            prompt_adapter_request=prompt_adapter_request)
+
+        return [EmbeddingRequestOutput.from_base(item) for item in items]
+
+    def classify(
+        self,
+        prompts: Union[PromptType, Sequence[PromptType]],
+        /,
+        *,
+        use_tqdm: bool = True,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> List[ClassificationRequestOutput]:
+        """
+        Generate class probabilities for each prompt.
+
+        This class automatically batches the given prompts, considering
+        the memory constraint. For the best performance, put all of your prompts
+        into a single list and pass it to this method.
+
+        Args:
+            prompts: The prompts to the LLM. You may pass a sequence of prompts
+                for batch inference. See :class:`~vllm.inputs.PromptType`
+                for more details about the format of each prompt.
+            use_tqdm: Whether to use tqdm to display the progress bar.
+            lora_request: LoRA request to use for generation, if any.
+            prompt_adapter_request: Prompt Adapter request to use for
+                generation, if any.
+
+        Returns:
+            A list of ``ClassificationRequestOutput`` objects containing the
+            class probabilities in the same order as the input prompts.
+        """
+        if self.llm_engine.model_config.task != "classify":
+            raise ValueError(
+                "Classification API is only enabled for `--task classify`")
+
+        items = self.encode(prompts,
+                            use_tqdm=use_tqdm,
+                            lora_request=lora_request,
+                            prompt_adapter_request=prompt_adapter_request)
+
+        return [ClassificationRequestOutput.from_base(item) for item in items]
+
     def score(
         self,
         text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]],
         text_2: Union[SingletonPrompt, Sequence[SingletonPrompt]],
         /,
+        *,
         truncate_prompt_tokens: Optional[int] = None,
         use_tqdm: bool = True,
         lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-    ) -> List[PoolingRequestOutput]:
-        """Generates similarity scores for all pairs <text_1, text_2>.
+    ) -> List[ScoringRequestOutput]:
+        """Generate similarity scores for all pairs ``<text_1, text_2>``.
 
-        The inputs can be 1 -> 1, 1 -> N or N -> N. In the 1 - N case
-        the text_1 sentence will be replicated N times to pair with the text_2
-        sentences. The input pairs are used to build a list of prompts for the
+        The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``.
+        In the ``1 -> N`` case the ``text_1`` sentence will be replicated ``N``
+        times to pair with the ``text_2`` sentences.
+        The input pairs are used to build a list of prompts for the
         cross encoder model. This class automatically batches the prompts,
         considering the memory constraint. For the best performance, put all
        of your texts into a single list and pass it to this method.
Args: text_1: can be a single prompt or a list of prompts, in which - case it has to have the same length as the text_2 list + case it has to have the same length as the ``text_2`` list text_2: The texts to pair with the query to form the input to the LLM. See :class:`~vllm.inputs.PromptType` for more details about the format of each prompts. @@ -864,7 +973,7 @@ def score( generation, if any. Returns: - A list of ``PoolingRequestOutput`` objects containing the + A list of ``ScoringRequestOutput`` objects containing the generated scores in the same order as the input prompts. """ runner_type = self.llm_engine.model_config.runner_type @@ -884,6 +993,8 @@ def score( if not self.llm_engine.model_config.is_cross_encoder: raise ValueError("Your model does not support cross encoding") + if self.llm_engine.model_config.task != "score": + raise ValueError("Score API is only enabled for `--task score`") tokenizer = self.llm_engine.get_tokenizer() @@ -954,8 +1065,10 @@ def ensure_str(prompt: SingletonPrompt): ) outputs = self._run_engine(use_tqdm=use_tqdm) - return self.engine_class.validate_outputs(outputs, - PoolingRequestOutput) + items = self.engine_class.validate_outputs(outputs, + PoolingRequestOutput) + + return [ScoringRequestOutput.from_base(item) for item in items] def start_profile(self) -> None: self.llm_engine.start_profile() diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index ee94a9413f098..34c9f0a96216f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -900,7 +900,7 @@ class EmbeddingResponse(OpenAIBaseModel): class ScoreResponseData(OpenAIBaseModel): index: int object: str = "score" - score: Union[List[float], str] + score: float class ScoreResponse(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 3f7b75e893cad..fd501ad4f833e 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -18,14 +18,15 @@ ErrorResponse, UsageInfo) from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.logger import init_logger -from vllm.outputs import PoolingOutput, PoolingRequestOutput +from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, + PoolingRequestOutput) from vllm.utils import merge_async_iterators logger = init_logger(__name__) def _get_embedding( - output: PoolingOutput, + output: EmbeddingOutput, encoding_format: Literal["float", "base64"], ) -> Union[List[float], str]: if encoding_format == "float": @@ -46,8 +47,10 @@ def request_output_to_embedding_response( data: List[EmbeddingResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): + embedding_res = EmbeddingRequestOutput.from_base(final_res) prompt_token_ids = final_res.prompt_token_ids - embedding = _get_embedding(final_res.outputs, encoding_format) + + embedding = _get_embedding(embedding_res.outputs, encoding_format) embedding_data = EmbeddingResponseData(index=idx, embedding=embedding) data.append(embedding_data) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d5ad4354c78be..5b6a089e4c319 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -31,7 +31,7 @@ ErrorResponse, LoadLoraAdapterRequest, ModelCard, ModelList, - ModelPermission, + ModelPermission, ScoreRequest, TokenizeChatRequest, TokenizeCompletionRequest, 
UnloadLoraAdapterRequest) @@ -73,7 +73,7 @@ class LoRAModulePath: CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, - EmbeddingCompletionRequest, + EmbeddingCompletionRequest, ScoreRequest, TokenizeCompletionRequest] ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest, @@ -567,12 +567,14 @@ async def _get_trace_headers( return None @staticmethod - def _base_request_id(raw_request: Request, + def _base_request_id(raw_request: Optional[Request], default: Optional[str] = None) -> Optional[str]: """Pulls the request id to use from a header, if provided""" default = default or random_uuid() - return raw_request.headers.get( - "X-Request-Id", default) if raw_request is not None else default + if raw_request is None: + return default + + return raw_request.headers.get("X-Request-Id", default) @staticmethod def _get_decoded_token(logprob: Logprob, diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 4929e720c00e4..6f5cc14ac37cc 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -13,7 +13,7 @@ from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger -from vllm.outputs import PoolingRequestOutput +from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.utils import make_async, merge_async_iterators @@ -24,13 +24,13 @@ def request_output_to_score_response( final_res_batch: List[PoolingRequestOutput], request_id: str, created_time: int, model_name: str) -> ScoreResponse: data: List[ScoreResponseData] = [] - score = None num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): - if final_res is not None: - score = final_res.outputs.embedding - score_data = ScoreResponseData(index=idx, score=score) - data.append(score_data) + classify_res = ScoringRequestOutput.from_base(final_res) + + score_data = ScoreResponseData(index=idx, + score=classify_res.outputs.score) + data.append(score_data) usage = UsageInfo( prompt_tokens=num_prompt_tokens, diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index e0d42e30ebef3..75bf33dc70a51 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,14 +1,16 @@ from enum import IntEnum -from typing import List, Optional +from typing import List, Optional, Union import torch import torch.nn as nn +import torch.nn.functional as F from transformers import PretrainedConfig +from typing_extensions import assert_never from vllm.config import PoolerConfig from vllm.model_executor.pooling_metadata import (PoolingMetadata, PoolingTensors) -from vllm.sequence import EmbeddingSequenceGroupOutput, PoolerOutput +from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) @@ -22,7 +24,7 @@ class PoolingType(IntEnum): MEAN = 4 -class Pooler(nn.Module): +class SimplePooler(nn.Module): """A layer that pools specific information from hidden states. This layer does the following: @@ -35,21 +37,203 @@ class Pooler(nn.Module): normalize: Whether to normalize the pooled data. 
""" + @staticmethod + def from_pooling_type( + pooling_type: PoolingType, + *, + normalize: bool, + softmax: bool, + step_tag_id: Optional[int] = None, + returned_token_ids: Optional[List[int]] = None, + ) -> "SimplePooler": + if pooling_type == PoolingType.LAST: + assert step_tag_id is None and returned_token_ids is None + return LastPool(normalize=normalize, softmax=softmax) + if pooling_type == PoolingType.ALL: + assert step_tag_id is None and returned_token_ids is None + return AllPool(normalize=normalize, softmax=softmax) + if pooling_type == PoolingType.CLS: + assert step_tag_id is None and returned_token_ids is None + return CLSPool(normalize=normalize, softmax=softmax) + if pooling_type == PoolingType.MEAN: + assert step_tag_id is None and returned_token_ids is None + return MeanPool(normalize=normalize, softmax=softmax) + if pooling_type == PoolingType.STEP: + return StepPool(normalize=normalize, + softmax=softmax, + step_tag_id=step_tag_id, + returned_token_ids=returned_token_ids) + + assert_never(pooling_type) + + def __init__(self, *, normalize: bool, softmax: bool) -> None: + super().__init__() + + self.head = PoolerHead(normalize=normalize, softmax=softmax) + + def get_prompt_lens( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> torch.Tensor: + return PoolingTensors.from_pooling_metadata( + pooling_metadata, hidden_states.device).prompt_lens + + def extract_states( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Union[list[torch.Tensor], torch.Tensor]: + raise NotImplementedError + + def build_output(self, data: torch.Tensor) -> PoolingSequenceGroupOutput: + return PoolingSequenceGroupOutput(data) + + def forward( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + pooled_data = self.extract_states(hidden_states, pooling_metadata) + pooled_data = self.head(pooled_data) + pooled_outputs = [self.build_output(data) for data in pooled_data] + return PoolerOutput(outputs=pooled_outputs) + + +class CLSPool(SimplePooler): + + def extract_states( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Union[list[torch.Tensor], torch.Tensor]: + prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + + first_token_flat_indices = torch.zeros_like(prompt_lens) + first_token_flat_indices[1:] += torch.cumsum(prompt_lens, dim=0)[:-1] + return hidden_states[first_token_flat_indices] + + +class LastPool(SimplePooler): + + def extract_states( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Union[list[torch.Tensor], torch.Tensor]: + prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + + last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1 + return hidden_states[last_token_flat_indices] + + +class AllPool(SimplePooler): + + def extract_states( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Union[list[torch.Tensor], torch.Tensor]: + prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + + offset = 0 + pooled_data = list[torch.Tensor]() + for prompt_len in prompt_lens: + pooled_data.append(hidden_states[offset:offset + prompt_len]) + offset += prompt_len + + return pooled_data + + +class MeanPool(SimplePooler): + + def extract_states( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Union[list[torch.Tensor], torch.Tensor]: + prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + + 
cumsum = torch.cumsum(hidden_states, dim=0) + start_indices = torch.cat([ + torch.tensor([0], device=hidden_states.device), + torch.cumsum(prompt_lens[:-1], dim=0) + ]) + end_indices = torch.cumsum(prompt_lens, dim=0) + return (cumsum[end_indices - 1] - cumsum[start_indices] + + hidden_states[start_indices]) / prompt_lens.unsqueeze(1) + + +class StepPool(SimplePooler): + def __init__( self, - pooling_type: PoolingType, + *, normalize: bool, softmax: bool, step_tag_id: Optional[int] = None, returned_token_ids: Optional[List[int]] = None, ): + super().__init__(normalize=normalize, softmax=softmax) + + self.step_tag_id = step_tag_id + self.returned_token_ids = returned_token_ids + + def extract_states( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Union[list[torch.Tensor], torch.Tensor]: + prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + + returned_token_ids = self.returned_token_ids + if returned_token_ids is not None and len(returned_token_ids) > 0: + hidden_states = hidden_states[:, returned_token_ids] + + step_tag_id = self.step_tag_id + + offset = 0 + pooled_data = list[torch.Tensor]() + for prompt_len, seq_data_i in zip(prompt_lens, + pooling_metadata.seq_data.values()): + pooled_data_i = hidden_states[offset:offset + prompt_len] + if step_tag_id is not None: + token_ids = torch.tensor(seq_data_i.prompt_token_ids) + pooled_data_i = pooled_data_i[token_ids == step_tag_id] + + offset += prompt_len + pooled_data.append(pooled_data_i) + + return pooled_data + + +class PoolerHead(nn.Module): + + def __init__(self, *, normalize: bool, softmax: bool) -> None: super().__init__() - self.pooling_type = pooling_type self.normalize = normalize self.softmax = softmax - self.step_tag_id = step_tag_id - self.returned_token_ids = returned_token_ids + + def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor]): + if self.normalize: + if isinstance(pooled_data, list): + pooled_data = [ + F.normalize(data, p=2, dim=1) for data in pooled_data + ] + else: + pooled_data = F.normalize(pooled_data, p=2, dim=1) + + if self.softmax: + if isinstance(pooled_data, list): + pooled_data = [F.softmax(data, dim=-1) for data in pooled_data] + else: + pooled_data = F.softmax(pooled_data, dim=-1) + + return pooled_data + + +class Pooler(nn.Module): @classmethod def from_config_with_defaults( @@ -60,8 +244,8 @@ def from_config_with_defaults( softmax: bool, step_tag_id: Optional[int] = None, returned_token_ids: Optional[List[int]] = None, - ) -> "Pooler": - return cls( + ) -> SimplePooler: + return SimplePooler.from_pooling_type( pooling_type=PoolingType[pooler_config.pooling_type] if pooler_config.pooling_type is not None else pooling_type, normalize=pooler_config.normalize @@ -75,85 +259,6 @@ def from_config_with_defaults( returned_token_ids, ) - def forward( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> PoolerOutput: - """Pools specific information from hidden states based on metadata.""" - - prompt_lens = PoolingTensors.from_pooling_metadata( - pooling_metadata, hidden_states.device).prompt_lens - - if self.pooling_type is PoolingType.CLS: - first_token_flat_indices = torch.zeros_like(prompt_lens) - first_token_flat_indices[1:] += torch.cumsum(prompt_lens, - dim=0)[:-1] - pooled_data = hidden_states[first_token_flat_indices] - elif self.pooling_type == PoolingType.LAST: - last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1 - pooled_data = hidden_states[last_token_flat_indices] - elif self.pooling_type 
== PoolingType.ALL: - offset = 0 - pooled_data = [] - for prompt_len in prompt_lens: - pooled_data.append(hidden_states[offset:offset + prompt_len]) - offset += prompt_len - elif self.pooling_type == PoolingType.MEAN: - # Calculate mean pooling - cumsum = torch.cumsum(hidden_states, dim=0) - start_indices = torch.cat([ - torch.tensor([0], device=hidden_states.device), - torch.cumsum(prompt_lens[:-1], dim=0) - ]) - end_indices = torch.cumsum(prompt_lens, dim=0) - pooled_data = ( - cumsum[end_indices - 1] - cumsum[start_indices] + - hidden_states[start_indices]) / prompt_lens.unsqueeze(1) - elif self.pooling_type == PoolingType.STEP: - returned_token_ids = self.returned_token_ids - if returned_token_ids is not None and len(returned_token_ids) > 0: - hidden_states = hidden_states[:, returned_token_ids] - - step_tag_id = self.step_tag_id - - offset = 0 - pooled_data = [] - for prompt_len, seq_data_i in zip( - prompt_lens, pooling_metadata.seq_data.values()): - pooled_data_i = hidden_states[offset:offset + prompt_len] - if step_tag_id is not None: - token_ids = torch.tensor(seq_data_i.prompt_token_ids) - pooled_data_i = pooled_data_i[token_ids == step_tag_id] - - offset += prompt_len - pooled_data.append(pooled_data_i) - else: - raise ValueError(f"Invalid pooling type: {self.pooling_type}") - - if self.normalize: - if isinstance(pooled_data, list): - pooled_data = [ - nn.functional.normalize(data, p=2, dim=1) - for data in pooled_data - ] - else: - pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1) - - if self.softmax: - if isinstance(pooled_data, list): - pooled_data = [ - nn.functional.softmax(data, dim=-1) for data in pooled_data - ] - else: - pooled_data = nn.functional.softmax(pooled_data, dim=-1) - - pooled_outputs = [ - EmbeddingSequenceGroupOutput(data.tolist()) for data in pooled_data - ] - - return PoolerOutput(outputs=pooled_outputs) - class CrossEncodingPooler(nn.Module): """A layer that pools specific information from hidden states. 
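# A minimal sketch of how the refactored pooler classes above are expected to
# compose (assumed usage, mirroring an embedding model; the PoolerConfig here
# carries no user overrides, so the stated defaults win):
from vllm.config import PoolerConfig
from vllm.model_executor.layers.pooler import Pooler, PoolingType

pooler = Pooler.from_config_with_defaults(
    PoolerConfig(),
    pooling_type=PoolingType.LAST,
    normalize=True,
    softmax=False,
)
# from_config_with_defaults now returns a SimplePooler subclass (LastPool in
# this case); its forward() calls extract_states() and then PoolerHead, which
# applies the normalize/softmax post-processing to the pooled data.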
@@ -208,9 +313,8 @@ def forward( if self.pooler is not None: # apply classifier once on the full batch if possible pooled_output = self.classifier(pooled_output) - logits = self.default_activation_function(pooled_output) - pooled_outputs = [ - EmbeddingSequenceGroupOutput(data.tolist()) for data in logits - ] + scores = self.default_activation_function(pooled_output).squeeze(-1) + + pooled_outputs = [PoolingSequenceGroupOutput(data) for data in scores] return PoolerOutput(outputs=pooled_outputs) diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 34c1332ac4a66..d179d6235424a 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -2,19 +2,20 @@ from typing import List, Optional, Union import torch -from torch import nn +import torch.nn as nn from xformers.ops.fmha.attn_bias import BlockDiagonalMask from vllm.attention import AttentionMetadata from vllm.attention.backends.xformers import XFormersImpl from vllm.config import ModelConfig, VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.pooler import PoolerHead from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.pooling_metadata import (PoolingMetadata, PoolingTensors) from vllm.multimodal.utils import cached_get_tokenizer -from vllm.sequence import (EmbeddingSequenceGroupOutput, IntermediateTensors, - PoolerOutput) +from vllm.sequence import (IntermediateTensors, PoolerOutput, + PoolingSequenceGroupOutput) logger = init_logger(__name__) @@ -52,6 +53,8 @@ def tokens_to_ids(tokens: list[str]) -> array: self.embed_pattern_ids = tokens_to_ids( ["▁<", "|", "embed", "|", ">", "<0x0A>"]) + self.head = PoolerHead(normalize=True, softmax=False) + def _find_array(self, arr: array, target: array, start_idx: int) -> int: """ Find the first occurrence of target in arr starting from start_idx. @@ -75,7 +78,7 @@ def _find_array(self, arr: array, target: array, start_idx: int) -> int: return i return -1 - def _get_instruction_len(self, prompt_token_ids: array) -> bool: + def _get_instruction_len(self, prompt_token_ids: array) -> int: """ Get the length of the instruction in the prompt. @@ -168,10 +171,10 @@ def forward( mean_embeddings = sum_embeddings / num_non_instruction_tokens.unsqueeze( 1) - pooled_data = nn.functional.normalize(mean_embeddings, p=2, dim=1) + pooled_data = self.head(mean_embeddings) pooled_outputs = [ - EmbeddingSequenceGroupOutput(data.tolist()) for data in pooled_data + PoolingSequenceGroupOutput(data) for data in pooled_data ] return PoolerOutput(outputs=pooled_outputs) diff --git a/vllm/outputs.py b/vllm/outputs.py index 86264f604f6bc..8c6c1aca3a917 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,9 +1,13 @@ import time +import warnings from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Dict, Generic, List, Optional from typing import Sequence as GenericSequence from typing import Union +import torch +from typing_extensions import TypeVar + from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind @@ -57,14 +61,26 @@ class PoolingOutput: """The output data of one pooling output of a request. Args: - embedding: The embedding vector, which is a list of floats. The - length of vector depends on the model as listed in the embedding guide. + data: The extracted hidden states. 
""" - embedding: List[float] + data: torch.Tensor def __repr__(self) -> str: - return (f"PoolingOutput(" - f"embedding={len(self.embedding)})") + return (f"PoolingOutput(data={self.data})") + + def __eq__(self, other: object) -> bool: + return (isinstance(other, self.__class__) and bool( + (self.data == other.data).all())) + + @property + def embedding(self) -> list[float]: + msg = ("`LLM.encode()` now returns raw outputs. " + "To return embeddings, use `LLM.embed()`. " + "To return class probabilities, use `LLM.classify()` " + "and access the `probs` attribute. ") + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + return self.data.tolist() class RequestOutput: @@ -316,7 +332,10 @@ def __repr__(self) -> str: f"multi_modal_placeholders={self.multi_modal_placeholders})") -class PoolingRequestOutput: +_O = TypeVar("_O", default=PoolingOutput) + + +class PoolingRequestOutput(Generic[_O]): """ The output data of a pooling request to the LLM. @@ -327,24 +346,24 @@ class PoolingRequestOutput: finished (bool): A flag indicating whether the pooling is completed. """ - def __init__(self, request_id: str, outputs: "PoolingOutput", + def __init__(self, request_id: str, outputs: _O, prompt_token_ids: List[int], finished: bool): self.request_id = request_id self.prompt_token_ids = prompt_token_ids self.finished = finished self.outputs = outputs - @classmethod - def from_seq_group(cls, - seq_group: 'SequenceGroup') -> "PoolingRequestOutput": - if seq_group.embeddings is None: - raise ValueError( - "Embeddings are missing in seq_group for EmbeddingRequest.") - output = PoolingOutput(seq_group.embeddings) + @staticmethod + def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput": + pooled_data = seq_group.pooled_data + assert pooled_data is not None + + output = PoolingOutput(pooled_data) prompt_token_ids = seq_group.prompt_token_ids finished = seq_group.is_finished() - return cls(seq_group.request_id, output, prompt_token_ids, finished) + return PoolingRequestOutput(seq_group.request_id, output, + prompt_token_ids, finished) def __repr__(self): """ @@ -356,89 +375,137 @@ def __repr__(self): Returns: str: A string representation of the PoolingRequestOutput instance. """ - return (f"PoolingRequestOutput(request_id='{self.request_id}', " - f"outputs={repr(self.outputs)}, " + return (f"{type(self).__name__}(request_id={self.request_id!r}, " + f"outputs={self.outputs!r}, " f"prompt_token_ids={self.prompt_token_ids}, " f"finished={self.finished})") +class RequestOutputFactory: + + @staticmethod + def create(seq_group: SequenceGroup, + seq_id_to_seq_group: Dict[str, SequenceGroupBase], + use_cache: bool = False): + if seq_group.pooled_data is not None: + return PoolingRequestOutput.from_seq_group(seq_group) + else: + return RequestOutput.from_seq_group(seq_group, use_cache, + seq_id_to_seq_group) + + @dataclass -class ScoreOutput: - """The output data of one completion output of a request. +class EmbeddingOutput: + """The output data of one embedding output of a request. Args: - score: The score, which is a list of floats. - index: The correspondent text index of the score. + embedding: The embedding vector, which is a list of floats. + Its length depends on the hidden dimension of the model. 
""" - index: int - score: List[float] + embedding: list[float] + + @staticmethod + def from_base(pooling_output: PoolingOutput): + pooled_data = pooling_output.data + if pooled_data.ndim != 1: + raise ValueError("pooled_data should be a 1-D embedding vector") + + return EmbeddingOutput(pooled_data.tolist()) + + @property + def hidden_size(self) -> int: + return len(self.embedding) def __repr__(self) -> str: - return (f"ScoreOutput(" - f"score={self.score}), " - f"index={self.index})") + return f"EmbeddingOutput(hidden_size={self.hidden_size})" -class ScoreRequestOutput: - """ - The output data of an score request to the LLM. +class EmbeddingRequestOutput(PoolingRequestOutput[EmbeddingOutput]): + + @staticmethod + def from_base(request_output: PoolingRequestOutput): + return EmbeddingRequestOutput( + request_id=request_output.request_id, + outputs=EmbeddingOutput.from_base(request_output.outputs), + prompt_token_ids=request_output.prompt_token_ids, + finished=request_output.finished, + ) + + +@dataclass +class ClassificationOutput: + """The output data of one classification output of a request. Args: - request_id (str): A unique identifier for the score request. - outputs (score): The embedding results for the given input. + probs: The probability vector, which is a list of floats. + Its length depends on the number of classes. """ + probs: list[float] - def __init__(self, request_id: str, outputs: "ScoreOutput"): - self.request_id = request_id - self.outputs = outputs + @staticmethod + def from_base(pooling_output: PoolingOutput): + pooled_data = pooling_output.data + if pooled_data.ndim != 1: + raise ValueError("pooled_data should be a 1-D probability vector") - def __repr__(self): - """ - Returns a string representation of an ScoreRequestOutput instance. + return ClassificationOutput(pooled_data.tolist()) - The representation includes the request_id and the number of outputs, - providing a quick overview of the embedding request's results. + @property + def num_classes(self) -> int: + return len(self.probs) - Returns: - str: A string representation of the ScoreRequestOutput instance. - """ - return (f"ScoreRequestOutput(request_id='{self.request_id}', " - f"outputs={repr(self.outputs)}") + def __repr__(self) -> str: + return f"ClassificationOutput(num_classes={self.num_classes})" -class RequestOutputFactory: +class ClassificationRequestOutput(PoolingRequestOutput[ClassificationOutput]): @staticmethod - def create(seq_group: SequenceGroup, - seq_id_to_seq_group: Dict[str, SequenceGroupBase], - use_cache: bool = False): - # Determine the type based on a condition, for example: - if hasattr(seq_group, - 'embeddings') and seq_group.embeddings is not None: - return PoolingRequestOutput.from_seq_group(seq_group) - else: - return RequestOutput.from_seq_group(seq_group, use_cache, - seq_id_to_seq_group) + def from_base(request_output: PoolingRequestOutput): + return ClassificationRequestOutput( + request_id=request_output.request_id, + outputs=ClassificationOutput.from_base(request_output.outputs), + prompt_token_ids=request_output.prompt_token_ids, + finished=request_output.finished, + ) -def __getattr__(name: str): - import warnings +@dataclass +class ScoringOutput: + """The output data of one scoring output of a request. - if name == "EmbeddingOutput": - msg = ("EmbeddingOutput has been renamed to PoolingOutput. " - "The original name will be removed in an upcoming version.") + Args: + score: The similarity score, which is a scalar value. 
+ """ + score: float + + @staticmethod + def from_base(pooling_output: PoolingOutput): + pooled_data = pooling_output.data + if pooled_data.ndim != 0: + raise ValueError("pooled_data should be a scalar score") - warnings.warn(DeprecationWarning(msg), stacklevel=2) + return ScoringOutput(pooled_data.item()) - return PoolingOutput + def __repr__(self) -> str: + return f"ScoringOutput(score={self.score})" - if name == "EmbeddingRequestOutput": - msg = ("EmbeddingRequestOutput has been renamed to " - "PoolingRequestOutput. " - "The original name will be removed in an upcoming version.") + @property + def embedding(self) -> list[float]: + msg = ("`LLM.score()` now returns scalar scores. " + "Please access it via the `score` attribute. ") + warnings.warn(msg, DeprecationWarning, stacklevel=2) - warnings.warn(DeprecationWarning(msg), stacklevel=2) + return [self.score] - return PoolingRequestOutput - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") +class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]): + + @staticmethod + def from_base(request_output: PoolingRequestOutput): + return ScoringRequestOutput( + request_id=request_output.request_id, + outputs=ScoringOutput.from_base(request_output.outputs), + prompt_token_ids=request_output.prompt_token_ids, + finished=request_output.finished, + ) diff --git a/vllm/sequence.py b/vllm/sequence.py index b0f3c1cc3609f..ddb9ca5944f10 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -617,10 +617,9 @@ class SequenceGroup: sampling_params: The sampling parameters used to generate the outputs. arrival_time: The arrival time of the request. lora_request: LoRA request. - embeddings: The embeddings vectors of the prompt of the sequence group - for a pooling model. - pooling_params: The pooling parameters used to generate the pooling + pooling_params: The parameters used to generate the pooler for a pooling model. + pooled_data: The extracted hidden states from a pooling model. encoder_seq: Optional, the single encoder sequence. Should be None unless you are working with an encoder/decoder model. trace_headers: OpenTelemetry trace headers. @@ -635,8 +634,8 @@ def __init__( arrival_time: float, sampling_params: Optional[SamplingParams] = None, lora_request: Optional[LoRARequest] = None, - embeddings: Optional[List[float]] = None, pooling_params: Optional[PoolingParams] = None, + pooled_data: Optional[torch.Tensor] = None, encoder_seq: Optional[Sequence] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -658,8 +657,8 @@ def __init__( self.lora_request = lora_request self.prompt_logprobs: Optional[PromptLogprobs] = None self.state = SequenceGroupState() - self.embeddings = embeddings self.pooling_params = pooling_params + self.pooled_data = pooled_data self.prompt_adapter_request = prompt_adapter_request self.encoder_seq = encoder_seq self.trace_headers = trace_headers @@ -1033,8 +1032,8 @@ class CompletionSequenceGroupOutput( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] - __metaclass__ = SequenceGroupOutput """The model output associated with a completion sequence group.""" + __metaclass__ = SequenceGroupOutput samples: List[SequenceOutput] # Prompt logprob for each prompt query token. 
prompt_logprobs: Optional[PromptLogprobs] @@ -1050,23 +1049,24 @@ def __eq__(self, other: object) -> bool: and self.prompt_logprobs == other.prompt_logprobs) -class EmbeddingSequenceGroupOutput( +class PoolingSequenceGroupOutput( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True, # type: ignore[call-arg] ): - """The model output associated with an embedding sequence group.""" + """The model output associated with a pooling sequence group.""" __metaclass__ = SequenceGroupOutput - embeddings: List[int] + # Annotated as Any to be compatible with msgspec + # The actual type is in SequenceGroup.pooled_data + data: Any def __repr__(self) -> str: - return (f"EmbeddingSequenceGroupOutput(" - f"embeddings_shape={len(self.embeddings)})") + return f"PoolingSequenceGroupOutput(data={self.data}" def __eq__(self, other: object) -> bool: - if not isinstance(other, EmbeddingSequenceGroupOutput): + if not isinstance(other, PoolingSequenceGroupOutput): raise NotImplementedError() - return self.embeddings == other.embeddings + return self.data == other.data # cannot use msgspec.Struct here because Dynamo does not support it @@ -1085,7 +1085,7 @@ def __getitem__(self, key: Union[str, slice]): elif isinstance(key, slice): return self.__class__({k: v[key] for k, v in self.tensors.items()}) - def __setitem__(self, key: str, value): + def __setitem__(self, key: str, value: torch.Tensor): self.tensors[key] = value def __len__(self): @@ -1103,16 +1103,12 @@ class PoolerOutput( omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] """The output from a pooling operation in the pooling model.""" - outputs: List[EmbeddingSequenceGroupOutput] - - # lazy import to avoid circular import - from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics - spec_decode_worker_metrics: Optional[SpecDecodeWorkerMetrics] = None + outputs: List[PoolingSequenceGroupOutput] - def __getitem__(self, idx: int) -> EmbeddingSequenceGroupOutput: + def __getitem__(self, idx: int) -> PoolingSequenceGroupOutput: return self.outputs[idx] - def __setitem__(self, idx: int, value): + def __setitem__(self, idx: int, value: PoolingSequenceGroupOutput): self.outputs[idx] = value def __len__(self): @@ -1385,8 +1381,8 @@ def add_request(request_id: str, engine, params, **kwargs): arrival_time=seq_group.arrival_time, sampling_params=original_params, lora_request=seq_group.lora_request, - embeddings=seq_group.embeddings, pooling_params=seq_group.pooling_params, + pooled_data=seq_group.pooled_data, encoder_seq=seq_group.encoder_seq, trace_headers=seq_group.trace_headers, prompt_adapter_request=seq_group.prompt_adapter_request, From 969da7d70bc0539f6be12027b71bef758325a61a Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 13 Dec 2024 03:09:30 -0800 Subject: [PATCH 11/56] [V1][VLM] Fix edge case bug for InternVL2 (#11165) Signed-off-by: Roger Wang --- vllm/model_executor/models/internvl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 42c769f79e202..f4b7e4478c164 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -669,8 +669,11 @@ def _process_image_input( image_embeds = self.extract_feature(image_input["data"]) patches_per_image = image_input["patches_per_image"] + + # Only one image in the current batch if len(patches_per_image) == 1: - image_embeds = image_embeds.unsqueeze(0) + image_embeds 
= image_embeds.view( + -1, self.config.text_config.hidden_size).unsqueeze(0) return image_embeds # NOTE: Image embeddings are split into separate tensors for each image From d1fa714cb1c9a708d7da0de27c99f7eee07fe663 Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Fri, 13 Dec 2024 21:39:00 +0800 Subject: [PATCH 12/56] [Refactor]A simple device-related refactor (#11163) Signed-off-by: noemotiovon Co-authored-by: noemotiovon --- vllm/platforms/cpu.py | 5 +++++ vllm/platforms/hpu.py | 9 +++++++++ vllm/platforms/interface.py | 17 +++++++++++++++++ vllm/platforms/neuron.py | 9 +++++++++ vllm/platforms/openvino.py | 10 +++++----- vllm/platforms/xpu.py | 5 +++++ vllm/utils.py | 27 +-------------------------- 7 files changed, 51 insertions(+), 31 deletions(-) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index e5142b985d1f2..aad8755d9fcd8 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -98,3 +98,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "vllm.worker.cpu_worker.CPUWorker" else: parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker" + + @classmethod + def is_pin_memory_available(cls) -> bool: + logger.warning("Pin memory is not supported on CPU.") + return False diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 7f22bee3eaa74..2b947d280f9f8 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -2,6 +2,8 @@ import torch +from vllm.logger import init_logger + from .interface import Platform, PlatformEnum, _Backend if TYPE_CHECKING: @@ -9,6 +11,8 @@ else: VllmConfig = None +logger = init_logger(__name__) + class HpuPlatform(Platform): _enum = PlatformEnum.HPU @@ -43,3 +47,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config = vllm_config.parallel_config if parallel_config.worker_cls == "auto": parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker" + + @classmethod + def is_pin_memory_available(cls): + logger.warning("Pin memory is not supported on HPU.") + return False diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index db06d2c18e681..4150b0cdf836a 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,6 +1,7 @@ import enum import platform import random +from platform import uname from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union import numpy as np @@ -16,6 +17,11 @@ logger = init_logger(__name__) +def in_wsl() -> bool: + # Reference: https://github.com/microsoft/WSL/issues/4071 + return "microsoft" in " ".join(uname()).lower() + + class _Backend(enum.Enum): FLASH_ATTN = enum.auto() FLASH_ATTN_VLLM_V1 = enum.auto() @@ -221,6 +227,17 @@ def get_cpu_architecture(cls) -> CpuArchEnum: return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN + @classmethod + def is_pin_memory_available(cls) -> bool: + """Checks whether pin memory is available on the current platform.""" + if in_wsl(): + # Pinning memory in WSL is not supported. + # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications + logger.warning("Using 'pin_memory=False' as WSL is detected. 
" + "This may slow down the performance.") + return False + return True + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 1e5c4bddfa24f..86113523385f6 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -1,5 +1,7 @@ from typing import TYPE_CHECKING, Optional +from vllm.logger import init_logger + from .interface import Platform, PlatformEnum if TYPE_CHECKING: @@ -7,6 +9,8 @@ else: VllmConfig = None +logger = init_logger(__name__) + class NeuronPlatform(Platform): _enum = PlatformEnum.NEURON @@ -28,3 +32,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.worker_cls == "auto": parallel_config.worker_cls = \ "vllm.worker.neuron_worker.NeuronWorker" + + @classmethod + def is_pin_memory_available(cls) -> bool: + logger.warning("Pin memory is not supported on Neuron.") + return False diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index e0f8e8b4b49fe..ccd94e8adb3b1 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -34,7 +34,7 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: return _Backend.OPENVINO @classmethod - def get_device_name(self, device_id: int = 0) -> str: + def get_device_name(cls, device_id: int = 0) -> str: return "openvino" @classmethod @@ -42,19 +42,19 @@ def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: return False @classmethod - def inference_mode(self): + def inference_mode(cls): return torch.inference_mode(mode=True) @classmethod - def is_openvino_cpu(self) -> bool: + def is_openvino_cpu(cls) -> bool: return "CPU" in envs.VLLM_OPENVINO_DEVICE @classmethod - def is_openvino_gpu(self) -> bool: + def is_openvino_gpu(cls) -> bool: return "GPU" in envs.VLLM_OPENVINO_DEVICE @classmethod - def is_pin_memory_available(self) -> bool: + def is_pin_memory_available(cls) -> bool: logger.warning("Pin memory is not supported on OpenViNO.") return False diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 11dbd04d55671..c20190e789d7e 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -78,3 +78,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config.distributed_executor_backend = "ray" if parallel_config.worker_cls == "auto": parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" + + @classmethod + def is_pin_memory_available(cls): + logger.warning("Pin memory is not supported on XPU.") + return False diff --git a/vllm/utils.py b/vllm/utils.py index 1882264c19775..fbc3ef7fa7f89 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -24,7 +24,6 @@ from collections import UserDict, defaultdict from collections.abc import Iterable, Mapping from functools import lru_cache, partial, wraps -from platform import uname from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Dict, Generic, Hashable, List, Literal, Optional, OrderedDict, Set, Tuple, Type, TypeVar, Union, overload) @@ -344,12 +343,6 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) -@lru_cache(maxsize=None) -def in_wsl() -> bool: - # Reference: https://github.com/microsoft/WSL/issues/4071 - return "microsoft" in " ".join(uname()).lower() - - def make_async( func: Callable[P, T], executor: Optional[concurrent.futures.Executor] = None @@ -729,25 +722,7 @@ def print_warning_once(msg: str) -> None: @lru_cache(maxsize=None) def is_pin_memory_available() -> bool: - - if in_wsl(): - # Pinning memory in 
WSL is not supported. - # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications - print_warning_once("Using 'pin_memory=False' as WSL is detected. " - "This may slow down the performance.") - return False - elif current_platform.is_xpu(): - print_warning_once("Pin memory is not supported on XPU.") - return False - elif current_platform.is_neuron(): - print_warning_once("Pin memory is not supported on Neuron.") - return False - elif current_platform.is_hpu(): - print_warning_once("Pin memory is not supported on HPU.") - return False - elif current_platform.is_cpu() or current_platform.is_openvino(): - return False - return True + return current_platform.is_pin_memory_available() class DeviceMemoryProfiler: From c31d4a57a6b639900a7c70b6e844db0116c2f9f6 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Sat, 14 Dec 2024 00:51:25 +0900 Subject: [PATCH 13/56] [Core] support LoRA and prompt adapter in content-based hashing for Block Manager v2 prefix caching (#8240) --- tests/core/block/test_prefix_caching_block.py | 65 ++++++++++++++++++- tests/core/utils.py | 10 +++ vllm/core/block/block_table.py | 46 +++++++++---- vllm/core/block/common.py | 19 ++++-- vllm/core/block/cpu_gpu_block_allocator.py | 43 ++++++++---- vllm/core/block/interfaces.py | 32 ++++++--- vllm/core/block/naive_block.py | 10 ++- vllm/core/block/prefix_caching_block.py | 55 ++++++++++++---- vllm/core/block_manager.py | 8 ++- vllm/sequence.py | 13 ++++ 10 files changed, 246 insertions(+), 55 deletions(-) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index bbeb4b3a58f2a..29ac3a3c86cb4 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -5,7 +5,7 @@ import pytest -from tests.core.utils import create_dummy_sequence +from tests.core.utils import create_dummy_lora_sequence, create_dummy_sequence from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.core.block.interfaces import Block, BlockAllocator from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker, @@ -801,6 +801,7 @@ def create_immutable_chain( block_size: int, token_ids: List[int], allocator: PrefixCachingBlockAllocator, + extra_hash: Optional[int] = None, ) -> List[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ @@ -816,7 +817,9 @@ def create_immutable_chain( block_size:(block_number + 1) * block_size] prev_block = allocator.allocate_immutable_block( - prev_block=prev_block, token_ids=block_token_ids) + prev_block=prev_block, + token_ids=block_token_ids, + extra_hash=extra_hash) blocks.append(prev_block) return blocks @@ -931,3 +934,61 @@ def test_correct_block_hash(): allocator.mark_blocks_as_computed([]) assert tracker.get_num_cached_tokens(seq) == len(tokens) + + @staticmethod + def test_correct_extra_hash(): + """ + Test that the block hash is correctly computed based on the extra hash, + ensuring it matches the allocator's block hash, specifically for the + LoRA case, and that the correct number of cached tokens is retrieved. 
+ """ + block_size = 4 + allocator = CpuGpuBlockAllocator.create( + allocator_type="prefix_caching", + num_gpu_blocks=16, + num_cpu_blocks=16, + block_size=block_size, + ) + gpu_allocator = allocator._allocators[Device.GPU] + + tracker = ComputedBlocksTracker( + allocator=allocator, + block_size=block_size, + enable_caching=True, + ) + + tokens = list(range(block_size * 4)) + + # Create a dummy LoRA sequence with a specific LoRA ID. + lora_seq = create_dummy_lora_sequence(request_id=0, + token_ids=tokens, + block_size=block_size, + lora_int_id=1) + + _ = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=tokens, + allocator=gpu_allocator, + extra_hash=lora_seq.extra_hash(), + ) + + allocator.mark_blocks_as_computed([]) + + # Create different dummy sequences that have the same token IDs + # but different LoRA IDs. + seq = create_dummy_sequence(request_id=1, + token_ids=tokens, + block_size=block_size) + + different_lora_seq = create_dummy_lora_sequence(request_id=2, + token_ids=tokens, + block_size=block_size, + lora_int_id=2) + + # Due to the different LoRA IDs, corresponding blocks are not cached. + assert tracker.get_num_cached_tokens(seq) == 0 + assert tracker.get_num_cached_tokens(different_lora_seq) == 0 + + # The number of cached tokens matches the length of the tokens + # for the cached LoRA sequence. + assert tracker.get_num_cached_tokens(lora_seq) == len(tokens) diff --git a/tests/core/utils.py b/tests/core/utils.py index 277368b57b938..16703cd19fa1e 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -46,6 +46,16 @@ def create_dummy_prompt( return prompt, seq_group +def create_dummy_lora_sequence(request_id: int, token_ids: List[int], + block_size: int, lora_int_id: int) -> Sequence: + return Sequence(seq_id=request_id, + inputs=token_inputs(token_ids), + block_size=block_size, + lora_request=LoRARequest(lora_name="dummy", + lora_path="/dummy", + lora_int_id=lora_int_id)) + + def create_dummy_sequence(request_id: int, token_ids: List[int], block_size: int) -> Sequence: return Sequence( diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index d10cb29ef4a7c..dca0b3fe8d304 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -80,7 +80,8 @@ def get_num_required_blocks(token_ids: List[int], def allocate(self, token_ids: List[int], - device: Device = Device.GPU) -> None: + device: Device = Device.GPU, + extra_hash: Optional[int] = None) -> None: """Allocates memory blocks for storing the given sequence of token IDs. This method allocates the required number of blocks to store the given @@ -90,12 +91,16 @@ def allocate(self, token_ids (List[int]): The sequence of token IDs to be stored. device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. + extra_hash (Optional[int]): The hash value of additional + factors, such as adapters, that influence the block hash + in the prefixcaching block. 
""" assert not self._is_allocated assert token_ids blocks = self._allocate_blocks_for_token_ids(prev_block=None, token_ids=token_ids, - device=device) + device=device, + extra_hash=extra_hash) self.update(blocks) self._num_full_slots = len(token_ids) @@ -108,7 +113,8 @@ def update(self, blocks: List[Block]) -> None: def append_token_ids(self, token_ids: List[int], num_lookahead_slots: int = 0, - num_computed_slots: Optional[int] = None) -> None: + num_computed_slots: Optional[int] = None, + extra_hash: Optional[int] = None) -> None: """Appends a sequence of token IDs to the existing blocks in the BlockTable. @@ -130,6 +136,9 @@ def append_token_ids(self, Without sliding window, None can be passed. Without chunked prefill, it should be the same as _num_full_slots. + extra_hash (Optional[int]): The hash value of additional + factors such as adapters that influence the block, apart + from the token_ids. """ assert self._is_allocated, "no blocks have been allocated" assert len(self._blocks) > 0 @@ -149,7 +158,8 @@ def append_token_ids(self, # Ensure there are enough empty slots for the new tokens plus # lookahead slots self.ensure_num_empty_slots(num_empty_slots=len(token_ids) + - num_lookahead_slots) + num_lookahead_slots, + extra_hash=extra_hash) # Update the blocks with the new tokens first_block_idx = self._num_full_slots // self._block_size @@ -160,7 +170,9 @@ def append_token_ids(self, self._num_full_slots += len(token_ids) - def ensure_num_empty_slots(self, num_empty_slots: int) -> None: + def ensure_num_empty_slots(self, + num_empty_slots: int, + extra_hash: Optional[int] = None) -> None: """Ensures that the BlockTable has at least the specified number of empty slots available. @@ -171,6 +183,9 @@ def ensure_num_empty_slots(self, num_empty_slots: int) -> None: Args: num_empty_slots (int): The minimum number of empty slots required. + extra_hash (Optional[int]): The hash value of additional + factors such as adapters that influence the block, apart + from the token_ids. """ # Currently the block table only supports # appending tokens to GPU blocks. @@ -187,7 +202,9 @@ def ensure_num_empty_slots(self, num_empty_slots: int) -> None: assert len(self._blocks) > 0 self._blocks.append( self._allocator.allocate_mutable_block( - prev_block=self._blocks[-1], device=device)) + prev_block=self._blocks[-1], + device=device, + extra_hash=extra_hash)) def fork(self) -> "BlockTable": """Creates a new BlockTable instance with a copy of the blocks from the @@ -259,9 +276,12 @@ def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: # ones after the appended ones. 
return sequence_token_ids[self.num_full_slots:] - def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block], - token_ids: List[int], - device: Device) -> List[Block]: + def _allocate_blocks_for_token_ids( + self, + prev_block: Optional[Block], + token_ids: List[int], + device: Device, + extra_hash: Optional[int] = None) -> List[Block]: blocks: List[Block] = [] block_token_ids = [] @@ -275,8 +295,10 @@ def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block], if block_token_ids: blocks.extend( self._allocator.allocate_immutable_blocks( - prev_block, block_token_ids=block_token_ids, - device=device)) + prev_block, + block_token_ids=block_token_ids, + device=device, + extra_hash=extra_hash)) prev_block = blocks[-1] if tail_token_ids: @@ -284,7 +306,7 @@ def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block], cur_token_ids = tail_token_ids[0] block = self._allocator.allocate_mutable_block( - prev_block=prev_block, device=device) + prev_block=prev_block, device=device, extra_hash=extra_hash) block.append_token_ids(cur_token_ids) blocks.append(block) diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index eb190adfbe802..c03b5932eafb6 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -177,7 +177,8 @@ def __init__(self, block_size: int, create_block: Block.Factory, token_ids=[], block_size=self._block_size, allocator=self._allocator, - block_id=None)) + block_id=None, + extra_hash=None)) def increase_pool(self): """Doubles the internal pool size @@ -194,10 +195,15 @@ def increase_pool(self): token_ids=[], block_size=self._block_size, allocator=self._allocator, - block_id=None)) - - def init_block(self, prev_block: Optional[Block], token_ids: List[int], - block_size: int, physical_block_id: Optional[int]) -> Block: + block_id=None, + extra_hash=None)) + + def init_block(self, + prev_block: Optional[Block], + token_ids: List[int], + block_size: int, + physical_block_id: Optional[int], + extra_hash: Optional[int] = None) -> Block: if len(self._free_ids) == 0: self.increase_pool() assert len(self._free_ids) > 0 @@ -210,7 +216,8 @@ def init_block(self, prev_block: Optional[Block], token_ids: List[int], token_ids=token_ids, block_size=block_size, allocator=block._allocator, # type: ignore[attr-defined] - block_id=physical_block_id) + block_id=physical_block_id, + extra_hash=extra_hash) block.pool_id = pool_id # type: ignore[attr-defined] return block diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 3197af3c2b7a4..3a57487a6cd8a 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -121,23 +121,32 @@ def allocate_or_get_null_block(self) -> Block: self.allocate_mutable_block(None, Device.GPU)) return self._null_block - def allocate_mutable_block(self, prev_block: Optional[Block], - device: Device) -> Block: + def allocate_mutable_block(self, + prev_block: Optional[Block], + device: Device, + extra_hash: Optional[int] = None) -> Block: """Allocates a new mutable block on the specified device. Args: prev_block (Optional[Block]): The previous block to in the sequence. Used for prefix hashing. device (Device): The device on which to allocate the new block. + extra_hash (Optional[int]): The hash value of additional + factors, such as adapters, that influence the block hash + in the prefix caching block. Returns: Block: The newly allocated mutable block. 
""" - return self._allocators[device].allocate_mutable_block(prev_block) - - def allocate_immutable_blocks(self, prev_block: Optional[Block], - block_token_ids: List[List[int]], - device: Device) -> List[Block]: + return self._allocators[device].allocate_mutable_block( + prev_block, extra_hash=extra_hash) + + def allocate_immutable_blocks( + self, + prev_block: Optional[Block], + block_token_ids: List[List[int]], + device: Device, + extra_hash: Optional[int] = None) -> List[Block]: """Allocates a new group of immutable blocks with the provided block token IDs on the specified device. @@ -147,17 +156,22 @@ def allocate_immutable_blocks(self, prev_block: Optional[Block], block_token_ids (List[int]): The list of block token IDs to be stored in the new blocks. device (Device): The device on which to allocate the new block. + extra_hash (Optional[int]): The hash value of additional + factors, such as adapters, that influence the block hash + in the prefix caching block. Returns: List[Block]: The newly allocated list of immutable blocks containing the provided block token IDs. """ return self._allocators[device].allocate_immutable_blocks( - prev_block, block_token_ids) + prev_block, block_token_ids, extra_hash=extra_hash) - def allocate_immutable_block(self, prev_block: Optional[Block], + def allocate_immutable_block(self, + prev_block: Optional[Block], token_ids: List[int], - device: Device) -> Block: + device: Device, + extra_hash: Optional[int] = None) -> Block: """Allocates a new immutable block with the provided token IDs on the specified device. @@ -167,13 +181,16 @@ def allocate_immutable_block(self, prev_block: Optional[Block], token_ids (List[int]): The list of token IDs to be stored in the new block. device (Device): The device on which to allocate the new block. + extra_hash (Optional[int]): The hash value of additional + factors, such as adapters, that influence the block hash + in the prefix caching block. Returns: Block: The newly allocated immutable block containing the provided token IDs. """ return self._allocators[device].allocate_immutable_block( - prev_block, token_ids) + prev_block, token_ids, extra_hash=extra_hash) def free(self, block: Block) -> None: """Frees the memory occupied by the given block. 
@@ -387,6 +404,10 @@ def is_full(self): def prev_block(self): return self._proxy.prev_block + @property + def extra_hash(self): + return None + @property def computed(self): return self._proxy.computed diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 06f4851af3466..985a1098b6cd1 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -50,6 +50,11 @@ def is_full(self) -> bool: def prev_block(self) -> Optional["Block"]: pass + @property + @abstractmethod + def extra_hash(self) -> Optional[int]: + return None + @property @abstractmethod def computed(self) -> bool: @@ -81,6 +86,8 @@ def __call__( block_size: int, allocator: "BlockAllocator", block_id: Optional[int] = None, + computed: bool = False, + extra_hash: Optional[int] = None, ) -> "Block": pass @@ -99,18 +106,20 @@ def content_hash(self) -> Optional[int]: class BlockAllocator(ABC): @abstractmethod - def allocate_mutable_block(self, prev_block: Optional[Block]) -> Block: + def allocate_mutable_block(self, prev_block: Optional[Block], + extra_hash: Optional[int]) -> Block: pass @abstractmethod def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: List[int]) -> Block: + token_ids: List[int], + extra_hash: Optional[int]) -> Block: pass @abstractmethod - def allocate_immutable_blocks( - self, prev_block: Optional[Block], - block_token_ids: List[List[int]]) -> List[Block]: + def allocate_immutable_blocks(self, prev_block: Optional[Block], + block_token_ids: List[List[int]], + extra_hash: Optional[int]) -> List[Block]: pass @abstractmethod @@ -197,14 +206,18 @@ def find_cached_blocks_prefix( class DeviceAwareBlockAllocator(ABC): @abstractmethod - def allocate_mutable_block(self, prev_block: Optional[Block], - device: Device) -> Block: + def allocate_mutable_block(self, + prev_block: Optional[Block], + device: Device, + extra_hash: Optional[int] = None) -> Block: pass @abstractmethod - def allocate_immutable_block(self, prev_block: Optional[Block], + def allocate_immutable_block(self, + prev_block: Optional[Block], token_ids: List[int], - device: Device) -> Block: + device: Device, + extra_hash: Optional[int] = None) -> Block: pass @abstractmethod @@ -213,6 +226,7 @@ def allocate_immutable_blocks( prev_block: Optional[Block], block_token_ids: List[List[int]], device: Device, + extra_hash: Optional[int] = None, ) -> List[Block]: pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index a2af5ad6362c1..9b94918ab38ef 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -63,6 +63,7 @@ def __init__( def allocate_immutable_block(self, prev_block: Optional[Block], token_ids: List[int], + extra_hash: Optional[int] = None, device: Optional[Device] = None) -> Block: """Allocates a new immutable block with the given token IDs, linked to the previous block. @@ -85,6 +86,7 @@ def allocate_immutable_blocks( self, prev_block: Optional[Block], block_token_ids: List[List[int]], + extra_hash: Optional[int] = None, device: Optional[Device] = None) -> List[Block]: assert device is None num_blocks = len(block_token_ids) @@ -106,6 +108,7 @@ def allocate_immutable_blocks( def allocate_mutable_block(self, prev_block: Optional[Block], + extra_hash: Optional[int] = None, device: Optional[Device] = None) -> Block: """Allocates a new mutable block, linked to the previous block. 
@@ -355,7 +358,8 @@ def __init__(self, block_size: int, allocator: BlockAllocator, block_id: Optional[int] = None, - _cow_target: Optional[Block] = None): + _cow_target: Optional[Block] = None, + extra_hash: Optional[int] = None): self._token_ids: List[int] = [] self._block_size = block_size self._prev_block = prev_block @@ -441,6 +445,10 @@ def block_size(self) -> int: def prev_block(self) -> Optional["Block"]: return self._prev_block + @property + def extra_hash(self): + return None + @property def content_hash(self) -> Optional[int]: return None diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index b736167f6ceb4..1238303234deb 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -126,6 +126,7 @@ def _create_block( allocator: BlockAllocator, block_id: Optional[int] = None, computed: bool = False, + extra_hash: Optional[int] = None, ) -> Block: # Bind block to self. allocator = self @@ -137,11 +138,13 @@ def _create_block( block_id=block_id, allocator=allocator, computed=computed, + extra_hash=extra_hash, ) def allocate_immutable_block(self, prev_block: Optional[Block], token_ids: List[int], + extra_hash: Optional[int] = None, device: Optional[Device] = None) -> Block: """Allocates an immutable block with the given token IDs, reusing cached blocks if possible. @@ -160,7 +163,8 @@ def allocate_immutable_block(self, block = self._block_pool.init_block(prev_block=prev_block, token_ids=token_ids, block_size=self._block_size, - physical_block_id=None) + physical_block_id=None, + extra_hash=extra_hash) assert block.content_hash is not None cached_block_id = self._cached_blocks.get(block.content_hash, None) @@ -173,7 +177,7 @@ def allocate_immutable_block(self, self._block_pool.free_block(block) # No cached block => Allocate a new block - block = self.allocate_mutable_block(prev_block) + block = self.allocate_mutable_block(prev_block, extra_hash=extra_hash) block.append_token_ids(token_ids) return block @@ -181,17 +185,20 @@ def allocate_immutable_blocks( self, prev_block: Optional[Block], block_token_ids: List[List[int]], + extra_hash: Optional[int] = None, device: Optional[Device] = None) -> List[Block]: blocks = [] for token_ids in block_token_ids: prev_block = self.allocate_immutable_block(prev_block=prev_block, token_ids=token_ids, - device=device) + device=device, + extra_hash=extra_hash) blocks.append(prev_block) return blocks def allocate_mutable_block(self, prev_block: Optional[Block], + extra_hash: Optional[int] = None, device: Optional[Device] = None) -> Block: """Allocates a mutable block. If there are no free blocks, this will evict unused cached blocks. 
@@ -210,7 +217,8 @@ def allocate_mutable_block(self, block = self._block_pool.init_block(prev_block=prev_block, token_ids=[], block_size=self._block_size, - physical_block_id=block_id) + physical_block_id=block_id, + extra_hash=extra_hash) assert not block.computed assert block.content_hash is None return block @@ -382,7 +390,8 @@ def fork(self, last_block: Block) -> List[Block]: prev_block=prev_block, token_ids=block.token_ids, block_size=self._block_size, - physical_block_id=block_id) + physical_block_id=block_id, + extra_hash=block.extra_hash) forked_blocks.append(forked_block) prev_block = forked_blocks[-1] @@ -608,10 +617,12 @@ def swap_in(self, blocks: List[Block]) -> None: # existing "block" object if block.is_full: tmp_block = self.allocate_immutable_block( - prev_block=block.prev_block, token_ids=block.token_ids) + prev_block=block.prev_block, + token_ids=block.token_ids, + extra_hash=block.extra_hash) else: tmp_block = self.allocate_mutable_block( - prev_block=block.prev_block) + prev_block=block.prev_block, extra_hash=block.extra_hash) tmp_block.append_token_ids(block.token_ids) block_id = tmp_block.block_id @@ -679,6 +690,8 @@ class PrefixCachingBlock(Block): caching block allocator associated with this block. block_id (Optional[int], optional): The physical block index of this block. Defaults to None. + extra_hash (Optional[int]): The hash value of additional factors + such as adapters that influence the block, apart from the token_ids. """ def __init__( @@ -689,6 +702,7 @@ def __init__( allocator: BlockAllocator, block_id: Optional[int] = None, computed: bool = False, + extra_hash: Optional[int] = None, ): assert isinstance(allocator, PrefixCachingBlockAllocator), ( "Currently this class is only tested with " @@ -702,6 +716,7 @@ def __init__( self._allocator = allocator self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME self._computed = computed + self._extra_hash = extra_hash # On the first time, we create the block object, and next we only # reinitialize it @@ -811,6 +826,10 @@ def token_ids(self) -> List[int]: def prev_block(self) -> Optional[Block]: return self._prev_block + @property + def extra_hash(self) -> Optional[int]: + return self._extra_hash + @property def content_hash(self) -> Optional[int]: """Return the content-based hash of the current block, or None if it is @@ -841,18 +860,19 @@ def content_hash(self) -> Optional[int]: self._cached_content_hash = PrefixCachingBlock.hash_block_tokens( is_first_block, prev_block_hash, - cur_block_token_ids=self.token_ids) + cur_block_token_ids=self.token_ids, + extra_hash=self._extra_hash) return self._cached_content_hash @staticmethod - def hash_block_tokens(is_first_block: bool, prev_block_hash: Optional[int], - cur_block_token_ids: List[int]) -> int: + def hash_block_tokens(is_first_block: bool, + prev_block_hash: Optional[int], + cur_block_token_ids: List[int], + extra_hash: Optional[int] = None) -> int: """Computes a hash value corresponding to the contents of a block and the contents of the preceding block(s). The hash value is used for prefix caching. - NOTE: Content-based hashing does not yet support LoRA. - Parameters: - is_first_block (bool): A flag indicating if the block is the first in the sequence. @@ -860,12 +880,15 @@ def hash_block_tokens(is_first_block: bool, prev_block_hash: Optional[int], if this is the first block. - cur_block_token_ids (List[int]): A list of token ids in the current block. The current block is assumed to be full. 
+ - extra_hash (Optional[int]): The hash value of additional factors + such as adapters that influence the block, apart from the token_ids. Returns: - int: The computed hash value for the block. """ assert (prev_block_hash is None) == is_first_block - return hash((is_first_block, prev_block_hash, *cur_block_token_ids)) + return hash((is_first_block, prev_block_hash, *cur_block_token_ids, + extra_hash)) class ComputedBlocksTracker: @@ -935,12 +958,18 @@ def _update_seq_hashes(self, seq: Sequence) -> None: assert len(token_ids) >= (i + 1) * self._block_size block_token_ids = token_ids[i * self._block_size:(i + 1) * self._block_size] + + # NOTE: If there are any factors affecting the block besides + # token_ids, they should be added as input to extra_hash. + extra_hash = seq.extra_hash() + # This has to be kept in sync with the allocator's hash # calculation. block_hash = PrefixCachingBlock.hash_block_tokens( is_first_block=prev_block_hash is None, prev_block_hash=prev_block_hash, cur_block_token_ids=block_token_ids, + extra_hash=extra_hash, ) block_hashes_recorded.append(block_hash) prev_block_hash = block_hash diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 209487c6b4f9e..b41e848221882 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -151,8 +151,13 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: max_block_sliding_window=self.max_block_sliding_window, ) if seq.get_token_ids(): + # NOTE: If there are any factors affecting the block besides + # token_ids, they should be added as input to extra_hash. + extra_hash = seq.extra_hash() + # Add blocks to the block table only if the sequence is non empty. - block_table.allocate(seq.get_token_ids()) + block_table.allocate(token_ids=seq.get_token_ids(), + extra_hash=extra_hash) return block_table @@ -238,6 +243,7 @@ def append_slots( token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()), num_lookahead_slots=num_lookahead_slots, num_computed_slots=seq.data.get_num_computed_tokens(), + extra_hash=seq.extra_hash(), ) # Return any new copy-on-writes. new_cows = self.block_allocator.clear_copy_on_writes() diff --git a/vllm/sequence.py b/vllm/sequence.py index ddb9ca5944f10..cc3d96fc93a79 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -527,6 +527,19 @@ def hash_of_block(self, logical_idx: int) -> int: hashed_tokens = self.data.get_prefix_token_ids(num_tokens) return hash((hashed_tokens, self.lora_int_id)) + def extra_hash(self) -> Optional[int]: + """ + This function computes an extra hash for a sequence, specifically + designed for prefix caching mode. The final sequence hash is determined + by applying token_ids from the sequence's blocks. + """ + if self.prompt_adapter_id == 0 and self.lora_int_id == 0: + return None + + # NOTE: If there are additional factors influencing the block aside from + # token_ids, include them as input parameters to the hash. 
+ return hash((self.prompt_adapter_id, self.lora_int_id)) + def num_hashed_tokens_of_block(self, logical_idx: int): return logical_idx * self.block_size + self.block_size From 5b0ed8391d497439595a1968d65df93da98265ca Mon Sep 17 00:00:00 2001 From: zhangjf <1061683512@qq.com> Date: Fri, 13 Dec 2024 23:56:19 +0800 Subject: [PATCH 14/56] [Bugfix] using len(tokenizer) instead of tokenizer.vocab_size in AllowedTokenIdsLogitsProcessor (#11156) --- vllm/entrypoints/openai/logits_processors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py index 7913f8720ca73..c8132811de903 100644 --- a/vllm/entrypoints/openai/logits_processors.py +++ b/vllm/entrypoints/openai/logits_processors.py @@ -71,7 +71,7 @@ def get_logits_processors( # Check if token_id is within the vocab size for token_id, bias in clamped_logit_bias.items(): - if token_id < 0 or token_id >= tokenizer.vocab_size: + if token_id < 0 or token_id >= len(tokenizer): raise ValueError(f"token_id {token_id} in logit_bias contains " "out-of-vocab token id") @@ -81,6 +81,6 @@ def get_logits_processors( if allowed_token_ids is not None: logits_processors.append( _get_allowed_token_ids_logits_processor( - frozenset(allowed_token_ids), tokenizer.vocab_size)) + frozenset(allowed_token_ids), len(tokenizer))) return logits_processors From 238c0d93b40008244fae64530d82f1860b1f9121 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:19:10 -0500 Subject: [PATCH 15/56] [Misc] Add tokenizer_mode param to benchmark_serving.py (#11174) Signed-off-by: Alexander Matveev --- benchmarks/benchmark_serving.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3256692142c5e..4eb0e1f8ac903 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -781,6 +781,7 @@ def main(args: argparse.Namespace): backend = args.backend model_id = args.model tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + tokenizer_mode = args.tokenizer_mode if args.base_url is not None: api_url = f"{args.base_url}{args.endpoint}" @@ -790,6 +791,7 @@ def main(args: argparse.Namespace): base_url = f"http://{args.host}:{args.port}" tokenizer = get_tokenizer(tokenizer_id, + tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) if args.dataset is not None: @@ -1210,5 +1212,15 @@ def main(args: argparse.Namespace): "from the sampled HF dataset.", ) + parser.add_argument( + '--tokenizer-mode', + type=str, + default="auto", + choices=['auto', 'slow', 'mistral'], + help='The tokenizer mode.\n\n* "auto" will use the ' + 'fast tokenizer if available.\n* "slow" will ' + 'always use the slow tokenizer. 
\n* ' + '"mistral" will always use the `mistral_common` tokenizer.') + args = parser.parse_args() main(args) From 0920ab9131274df143cfc49245409378a009b3c6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 14 Dec 2024 00:22:22 +0800 Subject: [PATCH 16/56] [Doc] Reorganize online pooling APIs (#11172) Signed-off-by: DarkLight1337 --- docs/source/models/pooling_models.rst | 14 +- .../serving/openai_compatible_server.md | 462 ++++++++++-------- docs/source/usage/multimodal_inputs.rst | 8 +- examples/offline_inference_openai.md | 92 ++-- ...ai_chat_embedding_client_for_multimodal.py | 2 +- examples/openai_cross_encoder_score.py | 35 +- tests/entrypoints/openai/test_score.py | 6 +- vllm/entrypoints/openai/api_server.py | 11 +- vllm/entrypoints/openai/protocol.py | 7 +- vllm/outputs.py | 19 +- 10 files changed, 368 insertions(+), 288 deletions(-) diff --git a/docs/source/models/pooling_models.rst b/docs/source/models/pooling_models.rst index 94475c5e6689d..4e67677a2767a 100644 --- a/docs/source/models/pooling_models.rst +++ b/docs/source/models/pooling_models.rst @@ -50,10 +50,10 @@ It returns the extracted hidden states directly, which is useful for reward mode .. code-block:: python llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") - output, = llm.encode("Hello, my name is") + (output,) = llm.encode("Hello, my name is") data = output.outputs.data - print(f"Prompt: {prompt!r} | Data: {data!r}") + print(f"Data: {data!r}") ``LLM.embed`` ^^^^^^^^^^^^^ @@ -64,7 +64,7 @@ It is primarily designed for embedding models. .. code-block:: python llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") - output, = llm.embed("Hello, my name is") + (output,) = llm.embed("Hello, my name is") embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") @@ -80,7 +80,7 @@ It is primarily designed for classification models. .. code-block:: python llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") - output, = llm.classify("Hello, my name is") + (output,) = llm.classify("Hello, my name is") probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") @@ -102,8 +102,8 @@ These types of models serve as rerankers between candidate query-document pairs .. code-block:: python llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") - output, = llm.score("What is the capital of France?", - "The capital of Brazil is Brasilia.") + (output,) = llm.score("What is the capital of France?", + "The capital of Brazil is Brasilia.") score = output.outputs.score print(f"Score: {score}") @@ -119,7 +119,7 @@ Please click on the above link for more details on how to launch the server. Embeddings API ^^^^^^^^^^^^^^ -Our Embeddings API is similar to ``LLM.encode``, accepting both text and :ref:`multi-modal inputs `. +Our Embeddings API is similar to ``LLM.embed``, accepting both text and :ref:`multi-modal inputs `. The text-only API is compatible with `OpenAI Embeddings API `__ so that you can use OpenAI client to interact with it. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index f75653106cf66..14a5b02d72aa5 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -1,13 +1,13 @@ # OpenAI Compatible Server -vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. 
+vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more! -You can start the server using Python, or using [Docker](deploying_with_docker.rst): +You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.rst): ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` -To call the server, you can use the official OpenAI Python client library, or any other HTTP client. +To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client. ```python from openai import OpenAI client = OpenAI( @@ -25,166 +25,76 @@ completion = client.chat.completions.create( print(completion.choices[0].message) ``` -## API Reference +## Supported APIs We currently support the following OpenAI APIs: -- [Completions API](https://platform.openai.com/docs/api-reference/completions) +- [Completions API](#completions-api) (`/v1/completions`) + - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`). - *Note: `suffix` parameter is not supported.* -- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) +- [Chat Completions API](#chat-api) (`/v1/chat/completions`) + - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template). - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst). - *Note: `image_url.detail` parameter is not supported.* - We also support `audio_url` content type for audio files. - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).* - *Note: `parallel_tool_calls` and `user` parameters are ignored.* -- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) - - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), - which will be treated as a single prompt to the model according to its chat template. - - This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. - - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* - -## Score API for Cross Encoder Models +- [Embeddings API](#embeddings-api) (`/v1/embeddings`) + - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`). -vLLM supports *cross encoders models* at the **/v1/score** endpoint, which is not an OpenAI API standard endpoint. You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). +In addition, we have the following custom APIs: -A ***Cross Encoder*** takes exactly two sentences / texts as input and either predicts a score or label for this sentence pair. It can for example predict the similarity of the sentence pair on a scale of 0 … 1. 
+- [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`) + - Applicable to any model with a tokenizer. +- [Score API](#score-api) (`/score`) + - Only applicable to [cross-encoder models](../models/pooling_models.rst) (`--task score`). -### Example of usage for a pair of a string and a list of texts +(chat-template)= +## Chat Template -In this case, the model will compare the first given text to each of the texts containing the list. +In order for the language model to support chat protocol, vLLM requires the model to include +a chat template in its tokenizer configuration. The chat template is a Jinja2 template that +specifies how are roles, messages, and other chat-specific tokens are encoded in the input. -```bash -curl -X 'POST' \ - 'http://127.0.0.1:8000/v1/score' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-v2-m3", - "text_1": "What is the capital of France?", - "text_2": [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris." - ] -}' -``` +An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models) -Response: +Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those model, +you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat +template, or the template in string form. Without a chat template, the server will not be able to process chat +and all chat requests will error. ```bash -{ - "id": "score-request-id", - "object": "list", - "created": 693570, - "model": "BAAI/bge-reranker-v2-m3", - "data": [ - { - "index": 0, - "object": "score", - "score": [ - 0.001094818115234375 - ] - }, - { - "index": 1, - "object": "score", - "score": [ - 1 - ] - } - ], - "usage": {} -} +vllm serve --chat-template ./path-to-chat-template.jinja ``` -### Example of usage for a pair of two lists of texts - -In this case, the model will compare the one by one, making pairs by same index correspondent in each list. +vLLM community provides a set of chat templates for popular models. You can find them in the examples +directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) -```bash -curl -X 'POST' \ - 'http://127.0.0.1:8000/v1/score' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-v2-m3", - "encoding_format": "float", - "text_1": [ - "What is the capital of Brazil?", - "What is the capital of France?" - ], - "text_2": [ - "The capital of Brazil is Brasilia.", - "The capital of France is Paris." +With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies +both a `type` and a `text` field. An example is provided below: +```python +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} ] -}' -``` - -Response: - -```bash -{ - "id": "score-request-id", - "object": "list", - "created": 693447, - "model": "BAAI/bge-reranker-v2-m3", - "data": [ - { - "index": 0, - "object": "score", - "score": [ - 1 - ] - }, - { - "index": 1, - "object": "score", - "score": [ - 1 - ] - } - ], - "usage": {} -} +) ``` -### Example of usage for a pair of two strings - -In this case, the model will compare the strings of texts. 
- -```bash -curl -X 'POST' \ - 'http://127.0.0.1:8000/v1/score' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "BAAI/bge-reranker-v2-m3", - "encoding_format": "float", - "text_1": "What is the capital of France?", - "text_2": "The capital of France is Paris." -}' -``` +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +`meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the +request. vLLM provides best-effort support to detect this automatically, which is logged as a string like +*"Detected the chat template content format to be..."*, and internally converts incoming requests to match +the detected format, which can be one of: -Response: +- `"string"`: A string. + - Example: `"Hello world"` +- `"openai"`: A list of dictionaries, similar to OpenAI schema. + - Example: `[{"type": "text", "text": "Hello world!"}]` -```bash -{ - "id": "score-request-id", - "object": "list", - "created": 693447, - "model": "BAAI/bge-reranker-v2-m3", - "data": [ - { - "index": 0, - "object": "score", - "score": [ - 1 - ] - } - ], - "usage": {} -} -``` +If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument +to override which format to use. ## Extra Parameters @@ -204,7 +114,7 @@ completion = client.chat.completions.create( ) ``` -### Extra HTTP Headers +## Extra HTTP Headers Only `X-Request-Id` HTTP request header is supported for now. @@ -230,7 +140,53 @@ completion = client.completions.create( print(completion._request_id) ``` -### Extra Parameters for Completions API +## CLI Reference + +(vllm-serve)= +### `vllm serve` + +The `vllm serve` command is used to launch the OpenAI-compatible server. + +```{argparse} +:module: vllm.entrypoints.openai.cli_args +:func: create_parser_for_docs +:prog: vllm serve +``` + +#### Configuration file + +You can load CLI arguments via a [YAML](https://yaml.org/) config file. +The argument names must be the long form of those outlined [above](#vllm-serve). + +For example: + +```yaml +# config.yaml + +host: "127.0.0.1" +port: 6379 +uvicorn-log-level: "info" +``` + +To use the above config file: + +```bash +$ vllm serve SOME_MODEL --config config.yaml +``` + +```{note} +In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. +The order of priorities is `command line > config file values > defaults`. +``` + +## API Reference + +(completions-api)= +### Completions API + +Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/completions) for more details. + +#### Extra parameters The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. @@ -248,7 +204,12 @@ The following extra parameters are supported: :end-before: end-completion-extra-params ``` -### Extra Parameters for Chat Completions API +(chat-api)= +### Chat Completions API + +Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/chat) for more details. + +#### Extra parameters The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. 
@@ -266,7 +227,19 @@ The following extra parameters are supported: :end-before: end-chat-completion-extra-params ``` -### Extra Parameters for Embeddings API +(embeddings-api)= +### Embeddings API + +Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/embeddings) for more details. + +If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat Completions API](#chat-api)) +which will be treated as a single prompt to the model. + +```{tip} +This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. +``` + +#### Extra parameters The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. @@ -276,7 +249,7 @@ The following [pooling parameters (click through to see documentation)](../dev/p :end-before: end-embedding-pooling-params ``` -The following extra parameters are supported: +The following extra parameters are supported by default: ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -284,80 +257,179 @@ The following extra parameters are supported: :end-before: end-embedding-extra-params ``` -## Chat Template +For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: -In order for the language model to support chat protocol, vLLM requires the model to include -a chat template in its tokenizer configuration. The chat template is a Jinja2 template that -specifies how are roles, messages, and other chat-specific tokens are encoded in the input. +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-chat-embedding-extra-params +:end-before: end-chat-embedding-extra-params +``` -An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models) +(tokenizer-api)= +### Tokenizer API -Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those model, -you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat -template, or the template in string form. Without a chat template, the server will not be able to process chat -and all chat requests will error. +The Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer). +It consists of two endpoints: + +- `/tokenize` corresponds to calling `tokenizer.encode()`. +- `/detokenize` corresponds to calling `tokenizer.decode()`. + +(score-api)= +### Score API + +The Score API applies a cross-encoder model to predict scores for sentence pairs. +Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1. + +You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). + +#### Single inference + +You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. 
+ +Request: ```bash -vllm serve --chat-template ./path-to-chat-template.jinja +curl -X 'POST' \ + 'http://127.0.0.1:8000/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "encoding_format": "float", + "text_1": "What is the capital of France?", + "text_2": "The capital of France is Paris." +}' ``` -vLLM community provides a set of chat templates for popular models. You can find them in the examples -directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) +Response: -With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies -both a `type` and a `text` field. An example is provided below: -```python -completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} - ] -) +```bash +{ + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": 1 + } + ], + "usage": {} +} ``` -Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like -`meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the -request. vLLM provides best-effort support to detect this automatically, which is logged as a string like -*"Detected the chat template content format to be..."*, and internally converts incoming requests to match -the detected format, which can be one of: +#### Batch inference -- `"string"`: A string. - - Example: `"Hello world"` -- `"openai"`: A list of dictionaries, similar to OpenAI schema. - - Example: `[{"type": "text", "text": "Hello world!"}]` +You can pass a string to `text_1` and a list to `text_2`, forming multiple sentence pairs +where each pair is built from `text_1` and a string in `text_2`. +The total number of pairs is `len(text_2)`. -If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument -to override which format to use. +Request: -## Command line arguments for the server +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "text_1": "What is the capital of France?", + "text_2": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] +}' +``` -```{argparse} -:module: vllm.entrypoints.openai.cli_args -:func: create_parser_for_docs -:prog: vllm serve +Response: + +```bash +{ + "id": "score-request-id", + "object": "list", + "created": 693570, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": 0.001094818115234375 + }, + { + "index": 1, + "object": "score", + "score": 1 + } + ], + "usage": {} +} ``` +You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs +where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`). +The total number of pairs is `len(text_2)`. 
+ +Request: + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "encoding_format": "float", + "text_1": [ + "What is the capital of Brazil?", + "What is the capital of France?" + ], + "text_2": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] +}' +``` -### Config file +Response: -The `serve` module can also accept arguments from a config file in -`yaml` format. The arguments in the yaml must be specified using the -long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server): +```bash +{ + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": 1 + }, + { + "index": 1, + "object": "score", + "score": 1 + } + ], + "usage": {} +} +``` -For example: +#### Extra parameters -```yaml -# config.yaml +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. -host: "127.0.0.1" -port: 6379 -uvicorn-log-level: "info" +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-score-pooling-params +:end-before: end-score-pooling-params ``` -```bash -$ vllm serve SOME_MODEL --config config.yaml +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-score-extra-params +:end-before: end-score-extra-params ``` ---- -**NOTE** -In case an argument is supplied simultaneously using command line and the config file, the value from the commandline will take precedence. -The order of priorities is `command line > config file values > defaults`. diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst index c93f65327e31b..1e00f26f9a3ba 100644 --- a/docs/source/usage/multimodal_inputs.rst +++ b/docs/source/usage/multimodal_inputs.rst @@ -345,12 +345,12 @@ Here is an end-to-end example using VLM2Vec. To serve the model: .. code-block:: bash - vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ + vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja .. important:: - Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embed`` to run this model in embedding mode instead of text generation mode. The custom chat template is completely different from the original one for this model, @@ -386,12 +386,12 @@ Below is another example, this time using the ``MrLight/dse-qwen2-2b-mrl-v1`` mo .. code-block:: bash - vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja .. important:: - Like with VLM2Vec, we have to explicitly pass ``--task embedding``. + Like with VLM2Vec, we have to explicitly pass ``--task embed``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, which is handled by `this custom chat template `__. 
diff --git a/examples/offline_inference_openai.md b/examples/offline_inference_openai.md index 4c64197975534..2436417cb543a 100644 --- a/examples/offline_inference_openai.md +++ b/examples/offline_inference_openai.md @@ -1,45 +1,48 @@ # Offline Inference with the OpenAI Batch file format - **NOTE:** This is a guide to performing batch inference using the OpenAI batch file format, **NOT** the complete Batch (REST) API. - - ## File Format - - The OpenAI batch file format consists of a series of json objects on new lines. +```{important} +This is a guide to performing batch inference using the OpenAI batch file format, **not** the complete Batch (REST) API. +``` + +## File Format - [See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) +The OpenAI batch file format consists of a series of json objects on new lines. - Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) - **NOTE:** We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon). +Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. - ## Pre-requisites +```{note} +We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon). +``` -* Ensure you are using `vllm >= 0.4.3`. You can check by running `python -c "import vllm; print(vllm.__version__)"`. +## Pre-requisites + * The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`. - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens) - Install the token on your machine (Run `huggingface-cli login`). - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions. - ## Example 1: Running with a local file - - ### Step 1: Create your batch file - - To follow along with this example, you can download the example batch, or create your own batch file in your working directory. - - ``` - wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl - ``` - - Once you've created your batch file it should look like this - - ``` - $ cat openai_example_batch.jsonl +## Example 1: Running with a local file + +### Step 1: Create your batch file + +To follow along with this example, you can download the example batch, or create your own batch file in your working directory. 
+ +``` +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +``` + +Once you've created your batch file it should look like this + +``` +$ cat openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} - ``` - - ### Step 2: Run the batch +``` + +### Step 2: Run the batch The batch running tool is designed to be used from the command line. @@ -85,18 +88,18 @@ To integrate with cloud blob storage, we recommend using presigned urls. ### Step 1: Upload your input script To follow along with this example, you can download the example batch, or create your own batch file in your working directory. - - ``` - wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl - ``` - - Once you've created your batch file it should look like this - - ``` - $ cat openai_example_batch.jsonl + +``` +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl +``` + +Once you've created your batch file it should look like this + +``` +$ cat openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} - ``` +``` Now upload your batch file to your S3 bucket. @@ -104,7 +107,6 @@ Now upload your batch file to your S3 bucket. aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` - ### Step 2: Generate your presigned urls Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names. @@ -179,21 +181,19 @@ aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl - ### Step 1: Create your batch file - Add embedding requests to your batch file. The following is an example: +Add embedding requests to your batch file. 
The following is an example: - ``` - {"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}} +``` +{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}} {"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}} ``` - - You can even mix chat completion and embedding requests in the batch file, as long as the model you are using supports both chat completion and embeddings (note that all requests must use the same model). +You can even mix chat completion and embedding requests in the batch file, as long as the model you are using supports both chat completion and embeddings (note that all requests must use the same model). - ### Step 2: Run the batch +### Step 2: Run the batch You can run the batch using the same command as in earlier examples. - ### Step 3: Check your results You can check your results by running `cat results.jsonl` @@ -201,5 +201,5 @@ You can check your results by running `cat results.jsonl` ``` $ cat results.jsonl {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} -...``` +... ``` diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py index fff82020d9a30..a56e7429b7567 100644 --- a/examples/openai_chat_embedding_client_for_multimodal.py +++ b/examples/openai_chat_embedding_client_for_multimodal.py @@ -99,7 +99,7 @@ def dse_qwen2_vl(inp: dict): if __name__ == '__main__': parser = argparse.ArgumentParser( "Script to call a specified VLM through the API. Make sure to serve " - "the model with --task embedding before running this.") + "the model with --task embed before running this.") parser.add_argument("model", type=str, choices=["vlm2vec", "dse_qwen2_vl"], diff --git a/examples/openai_cross_encoder_score.py b/examples/openai_cross_encoder_score.py index 8c32eea5dd252..a06af8df5d3fe 100644 --- a/examples/openai_cross_encoder_score.py +++ b/examples/openai_cross_encoder_score.py @@ -1,14 +1,15 @@ -"""Examples Python client Score for Cross Encoder Models """ +Example online usage of Score API. +Run `vllm serve --task score` to start up the server in vLLM. 
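+For example: `vllm serve BAAI/bge-reranker-v2-m3 --task score` (the script's default model).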
+""" import argparse -import json import pprint import requests -def post_http_request(prompt: json, api_url: str) -> requests.Response: +def post_http_request(prompt: dict, api_url: str) -> requests.Response: headers = {"User-Agent": "Test Client"} response = requests.post(api_url, headers=headers, json=prompt) return response @@ -20,20 +21,29 @@ def post_http_request(prompt: json, api_url: str) -> requests.Response: parser.add_argument("--port", type=int, default=8000) parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") args = parser.parse_args() - api_url = f"http://{args.host}:{args.port}/v1/score" + api_url = f"http://{args.host}:{args.port}/score" model_name = args.model + text_1 = "What is the capital of Brazil?" + text_2 = "The capital of Brazil is Brasilia." + prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + score_response = post_http_request(prompt=prompt, api_url=api_url) + print("Prompt when text_1 and text_2 are both strings:") + pprint.pprint(prompt) + print("Score Response:") + pprint.pprint(score_response.json()) + text_1 = "What is the capital of France?" text_2 = [ "The capital of Brazil is Brasilia.", "The capital of France is Paris." ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt for text_1 is string and text_2 is a list:") + print("Prompt when text_1 is string and text_2 is a list:") pprint.pprint(prompt) print("Score Response:") - pprint.pprint(score_response.data) + pprint.pprint(score_response.json()) text_1 = [ "What is the capital of Brazil?", "What is the capital of France?" @@ -43,16 +53,7 @@ def post_http_request(prompt: json, api_url: str) -> requests.Response: ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt for text_1 and text_2 are lists:") - pprint.pprint(prompt) - print("Score Response:") - pprint.pprint(score_response.data) - - text_1 = "What is the capital of Brazil?" - text_2 = "The capital of Brazil is Brasilia." - prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} - score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt for text_1 and text_2 are strings:") + print("Prompt when text_1 and text_2 are both lists:") pprint.pprint(prompt) print("Score Response:") - pprint.pprint(score_response.data) \ No newline at end of file + pprint.pprint(score_response.json()) diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 0698c19ad0023..a803ea4a8d6ad 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -27,7 +27,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, "The capital of Brazil is Brasilia.", "The capital of France is Paris." ] - score_response = requests.post(server.url_for("v1/score"), + score_response = requests.post(server.url_for("score"), json={ "model": model_name, "text_1": text_1, @@ -55,7 +55,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, "The capital of Brazil is Brasilia.", "The capital of France is Paris." ] - score_response = requests.post(server.url_for("v1/score"), + score_response = requests.post(server.url_for("score"), json={ "model": model_name, "text_1": text_1, @@ -78,7 +78,7 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, text_1 = "What is the capital of France?" 
text_2 = "The capital of France is Paris." - score_response = requests.post(server.url_for("v1/score"), + score_response = requests.post(server.url_for("score"), json={ "model": model_name, "text_1": text_1, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2e27224b41864..14e3a34ce141c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -406,7 +406,7 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): assert_never(generator) -@router.post("/v1/score") +@router.post("/score") async def create_score(request: ScoreRequest, raw_request: Request): handler = score(raw_request) if handler is None: @@ -423,6 +423,15 @@ async def create_score(request: ScoreRequest, raw_request: Request): assert_never(generator) +@router.post("/v1/score") +async def create_score_v1(request: ScoreRequest, raw_request: Request): + logger.warning( + "To indicate that Score API is not part of standard OpenAI API, we " + "have moved it to `/score`. Please update your client accordingly.") + + return await create_score(request, raw_request) + + if envs.VLLM_TORCH_PROFILER_DIR: logger.warning( "Torch Profiler is enabled in the API server. This should ONLY be " diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 34c9f0a96216f..f4e7740ea0cff 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -812,10 +812,11 @@ class ScoreRequest(OpenAIBaseModel): text_2: Union[List[str], str] truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None - # doc: begin-chat-embedding-pooling-params + # doc: begin-score-pooling-params additional_data: Optional[Any] = None - # doc: end-chat-embedding-pooling-params + # doc: end-score-pooling-params + # doc: begin-score-extra-params priority: int = Field( default=0, description=( @@ -823,6 +824,8 @@ class ScoreRequest(OpenAIBaseModel): "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling.")) + # doc: end-score-extra-params + def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) diff --git a/vllm/outputs.py b/vllm/outputs.py index 8c6c1aca3a917..2ecdf74ee59b3 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,12 +1,11 @@ import time -import warnings from dataclasses import dataclass from typing import Dict, Generic, List, Optional from typing import Sequence as GenericSequence from typing import Union import torch -from typing_extensions import TypeVar +from typing_extensions import TypeVar, deprecated from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalPlaceholderDict @@ -73,13 +72,11 @@ def __eq__(self, other: object) -> bool: (self.data == other.data).all())) @property + @deprecated("`LLM.encode()` now stores raw outputs in the `data` " + "attribute. To return embeddings, use `LLM.embed()`. " + "To return class probabilities, use `LLM.classify()` " + "and access the `probs` attribute. ") def embedding(self) -> list[float]: - msg = ("`LLM.encode()` now returns raw outputs. " - "To return embeddings, use `LLM.embed()`. " - "To return class probabilities, use `LLM.classify()` " - "and access the `probs` attribute. ") - warnings.warn(msg, DeprecationWarning, stacklevel=2) - return self.data.tolist() @@ -491,11 +488,9 @@ def __repr__(self) -> str: return f"ScoringOutput(score={self.score})" @property + @deprecated("`LLM.score()` now returns scalar scores. 
" + "Please access it via the `score` attribute. ") def embedding(self) -> list[float]: - msg = ("`LLM.score()` now returns scalar scores. " - "Please access it via the `score` attribute. ") - warnings.warn(msg, DeprecationWarning, stacklevel=2) - return [self.score] From 0a56bcc03de0857be464c3f8783258d590cbc762 Mon Sep 17 00:00:00 2001 From: Jani Monoses Date: Fri, 13 Dec 2024 20:00:40 +0200 Subject: [PATCH 17/56] [Bugfix][Hardware][CPU] Enable Gemma2 with SDPA on CPU backend (#11169) --- vllm/attention/backends/torch_sdpa.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 86e952a903f36..0cff6f5952aba 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -13,7 +13,7 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.ipex_attn import PagedAttention from vllm.attention.ops.paged_attn import PagedAttentionMetadata -from vllm.utils import make_tensor_with_pad +from vllm.utils import make_tensor_with_pad, print_warning_once from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder @@ -395,7 +395,8 @@ def __init__( raise ValueError( "Torch SPDA does not support block-sparse attention.") if logits_soft_cap is not None: - raise ValueError("Torch SPDA does not support logits soft cap.") + print_warning_once("Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -619,7 +620,7 @@ def _run_sdpa_forward( value[None, :, start_kv:end_kv, :], attn_mask=mask, dropout_p=0.0, - is_causal=causal_attn and not self.need_mask, + is_causal=causal_attn and mask is None, scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) output[start_q:end_q, :, :] = sub_out start_q, start_kv = end_q, end_kv From 0d8451c3a45d309e58de5e1c546f043de461d478 Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Fri, 13 Dec 2024 12:17:37 -0800 Subject: [PATCH 18/56] [Distributed] Allow the placement group more time to wait for resources to be ready (#11138) Signed-off-by: Jiaxin Shan --- vllm/executor/ray_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4f28efd639084..426aa1b5c728f 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -277,10 +277,14 @@ def initialize_ray_cluster( f"Total number of devices: {device_bundles}.") else: num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) + # Log a warning message and delay resource allocation failure response. 
+ # Avoid immediate rejection to allow user-initiated placement group + # created and wait cluster to be ready if parallel_config.world_size > num_devices_in_cluster: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group.") + logger.warning( + "The number of required %ss exceeds the total " + "number of available %ss in the placement group.", device_str, + device_str) # Create a new placement group placement_group_specs: List[Dict[str, float]] = ([{ device_str: 1.0 From 4863e5fba51b8e1a5012e2a7582aece0ca575b89 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 13 Dec 2024 19:27:32 -0500 Subject: [PATCH 19/56] [Core] V1: Use multiprocessing by default (#11074) Signed-off-by: Russell Bryant --- docs/source/design/multiprocessing.md | 195 ++++++++++++++++++++++ docs/source/getting_started/debugging.rst | 56 +++++++ docs/source/index.rst | 1 + vllm/entrypoints/llm.py | 4 + vllm/envs.py | 4 +- vllm/executor/multiproc_worker_utils.py | 20 ++- vllm/v1/engine/core.py | 8 +- vllm/v1/engine/core_client.py | 11 +- vllm/v1/engine/llm_engine.py | 7 + vllm/v1/executor/multiproc_executor.py | 10 +- 10 files changed, 299 insertions(+), 17 deletions(-) create mode 100644 docs/source/design/multiprocessing.md diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md new file mode 100644 index 0000000000000..b58456ecc6da8 --- /dev/null +++ b/docs/source/design/multiprocessing.md @@ -0,0 +1,195 @@ +# Python Multiprocessing + +## Debugging + +Please see the [Debugging +Tips](https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing) +page for information on known issues and how to solve them. + +## Introduction + +*Note that source code references are to the state of the code at the time of writing in December, 2024.* + +The use of Python multiprocessing in vLLM is complicated by: + +- The use of vLLM as a library and the inability to control the code using vLLM +- Varying levels of incompatibilities between multiprocessing methods and vLLM + dependencies + +This document describes how vLLM deals with these challenges. + +## Multiprocessing Methods + +[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: + +- `spawn` - spawn a new Python process. This will be the default as of Python + 3.14. + +- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default + in Python versions prior to 3.14. + +- `forkserver` - Spawn a server process that will fork a new process on request. + +### Tradeoffs + +`fork` is the fastest method, but is incompatible with dependencies that use +threads. + +`spawn` is more compatible with dependencies, but can be problematic when vLLM +is used as a library. If the consuming code does not use a `__main__` guard (`if +__name__ == "__main__":`), the code will be inadvertently re-executed when vLLM +spawns a new process. This can lead to infinite recursion, among other problems. + +`forkserver` will spawn a new server process that will fork new processes on +demand. This unfortunately has the same problem as `spawn` when vLLM is used as +a library. The server process is created as a spawned new process, which will +re-execute code not protected by a `__main__` guard. + +For both `spawn` and `forkserver`, the process must not depend on inheriting any +global state as would be the case with `fork`. 
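+
+As a minimal sketch (plain `multiprocessing`, not vLLM code) of why this
+matters: under `spawn` or `forkserver` the child process re-imports the main
+module, so any module-level work runs again unless it sits behind a
+`__main__` guard, whereas under `fork` the child simply inherits that state.
+
+```
+import multiprocessing as mp
+
+print("module-level code runs")  # re-executed by every spawned child on import
+
+def worker() -> None:
+    print("worker running")
+
+if __name__ == "__main__":
+    # With "fork" the child would inherit state instead of re-importing.
+    ctx = mp.get_context("spawn")
+    p = ctx.Process(target=worker)
+    p.start()
+    p.join()
+```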
+ +## Compatibility with Dependencies + +Multiple vLLM dependencies indicate either a preference or requirement for using +`spawn`: + +- +- +- + +It is perhaps more accurate to say that there are known problems with using +`fork` after initializing these dependencies. + +## Current State (v0) + +The environment variable `VLLM_WORKER_MULTIPROC_METHOD` can be used to control which method is used by vLLM. The current default is `fork`. + +- + +When we know we own the process because the `vllm` command was used, we use +`spawn` because it's the most widely compatible. + +- + +The `multiproc_xpu_executor` forces the use of `spawn`. + +- + +There are other miscellaneous places hard-coding the use of `spawn`: + +- +- + +Related PRs: + +- + +## Prior State in v1 + +There was an environment variable to control whether multiprocessing is used in +the v1 engine core, `VLLM_ENABLE_V1_MULTIPROCESSING`. This defaulted to off. + +- + +When it was enabled, the v1 `LLMEngine` would create a new process to run the +engine core. + +- +- +- https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/core_client.py#L44-L45 + +It was off by default for all the reasons mentioned above - compatibility with +dependencies and code using vLLM as a library. + +### Changes Made in v1 + +There is not an easy solution with Python's `multiprocessing` that will work +everywhere. As a first step, we can get v1 into a state where it does "best +effort" choice of multiprocessing method to maximize compatibility. + +- Default to `fork`. +- Use `spawn` when we know we control the main process (`vllm` was executed). +- If we detect `cuda` was previously initialized, force `spawn` and emit a + warning. We know `fork` will break, so this is the best we can do. + +The case that is known to still break in this scenario is code using vLLM as a +library that initializes `cuda` before calling vLLM. The warning we emit should +instruct users to either add a `__main__` guard or to disable multiprocessing. + +If that known-failure case occurs, the user will see two messages that explain +what is happening. First, a log message from vLLM: + +``` + WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously + initialized. We must use the `spawn` multiprocessing start method. Setting + VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See + https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + for more information. +``` + +Second, Python itself will raise an exception with a nice explanation: + +``` +RuntimeError: + An attempt has been made to start a new process before the + current process has finished its bootstrapping phase. + + This probably means that you are not using fork to start your + child processes and you have forgotten to use the proper idiom + in the main module: + + if __name__ == '__main__': + freeze_support() + ... + + The "freeze_support()" line can be omitted if the program + is not going to be frozen to produce an executable. + + To fix this issue, refer to the "Safe importing of main module" + section in https://docs.python.org/3/library/multiprocessing.html +``` + +## Alternatives Considered + +### Detect if a `__main__` guard is present + +It has been suggested that we could behave better if we could detect whether +code using vLLM as a library has a `__main__` guard in place. 
This [post on +stackoverflow](https://stackoverflow.com/questions/77220442/multiprocessing-pool-in-a-python-class-without-name-main-guard) +was from a library author facing the same question. + +It is possible to detect whether we are in the original, `__main__` process, or +a subsequent spawned process. However, it does not appear to be straight forward +to detect whether a `__main__` guard is present in the code. + +This option has been discarded as impractical. + +### Use `forkserver` + +At first it appears that `forkserver` is a nice solution to the problem. +However, the way it works presents the same challenges that `spawn` does when +vLLM is used as a library. + +### Force `spawn` all the time + +One way to clean this up is to just force the use of `spawn` all the time and +document that the use of a `__main__` guard is required when using vLLM as a +library. This would unfortunately break existing code and make vLLM harder to +use, violating the desire to make the `LLM` class as easy as possible to use. + +Instead of pushing this on our users, we will retain the complexity to do our +best to make things work. + +## Future Work + +We may want to consider a different worker management approach in the future +that works around these challenges. + +1. We could implement something `forkserver`-like, but have the process manager + be something we initially launch by running our own subprocess and a custom + entrypoint for worker management (launch a `vllm-manager` process). + +2. We can explore other libraries that may better suit our needs. Examples to + consider: + +- diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 0c1afcbd7c0b9..d6c83014dc69f 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -136,6 +136,62 @@ If the test script hangs or crashes, usually it means the hardware/drivers are b Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes. +Python multiprocessing +---------------------- + +`RuntimeError` Exception +^^^^^^^^^^^^^^^^^^^^^^^^ + +If you have seen a warning in your logs like this: + +.. code-block:: console + + WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously + initialized. We must use the `spawn` multiprocessing start method. Setting + VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See + https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + for more information. + +or an error from Python that looks like this: + +.. code-block:: console + + RuntimeError: + An attempt has been made to start a new process before the + current process has finished its bootstrapping phase. + + This probably means that you are not using fork to start your + child processes and you have forgotten to use the proper idiom + in the main module: + + if __name__ == '__main__': + freeze_support() + ... + + The "freeze_support()" line can be omitted if the program + is not going to be frozen to produce an executable. + + To fix this issue, refer to the "Safe importing of main module" + section in https://docs.python.org/3/library/multiprocessing.html + +then you must update your Python code to guard usage of ``vllm`` behind a ``if +__name__ == '__main__':`` block. For example, instead of this: + +.. code-block:: python + + import vllm + + llm = vllm.LLM(...) + +try this instead: + +.. 
code-block:: python + + if __name__ == '__main__': + import vllm + + llm = vllm.LLM(...) + Known Issues ---------------------------------------- - In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_. diff --git a/docs/source/index.rst b/docs/source/index.rst index 842013d6d49c4..8ac09f6988893 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -173,6 +173,7 @@ Documentation design/input_processing/model_inputs_index design/kernel/paged_attention design/multimodal/multimodal_index + design/multiprocessing .. For Developers: contributing to the vLLM project diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 11b2574ce42dd..58ab892676b9a 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -232,6 +232,10 @@ def __init__( self.request_counter = Counter() + def __del__(self): + if self.llm_engine and hasattr(self.llm_engine, "shutdown"): + self.llm_engine.shutdown() + @staticmethod def get_engine_class() -> Type[LLMEngine]: if envs.VLLM_USE_V1: diff --git a/vllm/envs.py b/vllm/envs.py index bc8c1499e9534..da17b747ea215 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -69,7 +69,7 @@ VLLM_SKIP_P2P_CHECK: bool = False VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False - VLLM_ENABLE_V1_MULTIPROCESSING: bool = False + VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -460,7 +460,7 @@ def get_default_config_root(): # If set, enable multiprocessing in LLM for the V1 code path. "VLLM_ENABLE_V1_MULTIPROCESSING": - lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0"))), + lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))), "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), } diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index fe475db6d3f57..c4d90f0856f86 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -274,7 +274,20 @@ def write_with_prefix(s: str): file.write = write_with_prefix # type: ignore[method-assign] +def _check_multiproc_method(): + if (cuda_is_initialized() + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): + logger.warning("CUDA was previously initialized. We must use " + "the `spawn` multiprocessing start method. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "See https://docs.vllm.ai/en/latest/getting_started/" + "debugging.html#python-multiprocessing " + "for more information.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + def get_mp_context(): + _check_multiproc_method() mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD return multiprocessing.get_context(mp_method) @@ -284,12 +297,7 @@ def set_multiprocessing_worker_envs(parallel_config): in a multiprocessing environment. This should be called by the parent process before worker processes are created""" - if (cuda_is_initialized() - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): - logger.warning("CUDA was previously initialized. We must use " - "the `spawn` multiprocessing start method. 
Setting " - "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + _check_multiproc_method() # Configure thread parallelism if OMP_NUM_THREADS isn't set # diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6246a0067842a..ee7419bce2565 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,4 +1,3 @@ -import multiprocessing import pickle import queue import signal @@ -13,6 +12,7 @@ from msgspec import msgpack from vllm.config import CacheConfig, VllmConfig +from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.v1.core.scheduler import Scheduler @@ -210,11 +210,7 @@ def make_engine_core_process( output_path: str, ready_path: str, ) -> EngineCoreProcHandle: - # The current process might have CUDA context, - # so we need to spawn a new process. - # NOTE(rob): this is a problem for using EngineCoreProc w/ - # LLM, since we need a if __name__ == "__main__" guard. - context = multiprocessing.get_context("spawn") + context = get_mp_context() process_kwargs = { "input_path": input_path, diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index a66ae111be8c5..e0bfe1b93b360 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -159,10 +159,16 @@ def __init__( atexit.register(self.shutdown) def shutdown(self): + # During final garbage collection in process shutdown, atexit may be + # None. + if atexit: + # in case shutdown gets called via __del__ first + atexit.unregister(self.shutdown) + # Shut down the zmq context. self.ctx.destroy(linger=0) - if hasattr(self, "proc_handle"): + if hasattr(self, "proc_handle") and self.proc_handle: # Shutdown the process if needed. if self.proc_handle.proc.is_alive(): self.proc_handle.proc.terminate() @@ -178,8 +184,9 @@ def shutdown(self): ] for ipc_socket in ipc_sockets: socket_file = ipc_socket.replace("ipc://", "") - if os.path.exists(socket_file): + if os and os.path.exists(socket_file): os.remove(socket_file) + self.proc_handle = None def __del__(self): self.shutdown() diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1b3a9f12d009e..c02494897b41f 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -196,3 +196,10 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group + + def __del__(self): + self.shutdown() + + def shutdown(self): + if engine_core := getattr(self, "engine_core", None): + engine_core.shutdown() diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 63a12f791051f..14384a730ceec 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -163,6 +163,10 @@ def _ensure_worker_termination(self): termination and kill signals if needed.""" def wait_for_termination(procs, timeout): + if not time: + # If we are in late stage shutdown, the interpreter may replace + # `time` with `None`. 
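+                # In that case, check liveness once instead of polling
+                # with a timeout.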
+ return all(not proc.is_alive() for proc in procs) start_time = time.time() while time.time() - start_time < timeout: if all(not proc.is_alive() for proc in procs): @@ -187,10 +191,14 @@ def _cleanup_sockets(self): for w in self.workers: # Remove the zmq ipc socket file socket_path = w.ready_path.replace("ipc://", "") - if os.path.exists(socket_path): + if os and os.path.exists(socket_path): os.remove(socket_path) def shutdown(self): + if atexit: + # in case shutdown was called explicitly, we don't need to call it + # again + atexit.unregister(self.shutdown) """Properly shut down the executor and its workers""" if (hasattr(self, 'workers') and self.workers is not None): for w in self.workers: #TODO: not sure if needed From 4b5b8a6a3bd94d9b0248b36b0eb4739d76fbb386 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 13 Dec 2024 20:02:35 -0500 Subject: [PATCH 20/56] [V1][Bugfix] Fix EngineCoreProc profile (#11185) Signed-off-by: Tyler Michael Smith --- vllm/v1/engine/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ee7419bce2565..dc8c1d39eefa9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -321,7 +321,7 @@ def _handle_client_request( if isinstance(request, EngineCoreRequest): self.add_request(request) elif isinstance(request, EngineCoreProfile): - self.model_executor.worker.profile(request.is_start) + self.model_executor.profile(request.is_start) else: # TODO: make an EngineCoreAbort wrapper assert isinstance(request, list) From 9855aea21b6aec48b12cef3a1614e7796b970a73 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 13 Dec 2024 17:08:23 -0800 Subject: [PATCH 21/56] [Bugfix][V1] Re-compute an entire block when fully cache hit (#11186) Signed-off-by: Cody Yu --- vllm/v1/core/scheduler.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index a3e85c20cc664..f055eed77c372 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -199,9 +199,13 @@ def schedule(self) -> "SchedulerOutput": if num_new_tokens == 0: # The happens when prompt length is divisible by the block # size and all blocks are cached. Now we force to recompute - # the last token. - num_computed_tokens -= 1 - num_new_tokens = 1 + # the last block. Note that we have to re-compute an entire + # block because allocate_slots() assumes num_computed_tokens + # is always a multiple of the block size. This limitation + # can potentially be removed in the future to slightly + # improve the performance. 
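+                # For example, with block_size=16 and a fully cached 32-token
+                # prompt, this rolls back to 16 computed tokens and
+                # re-schedules the final block of 16 tokens.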
+ num_computed_tokens -= self.block_size + num_new_tokens = self.block_size computed_blocks.pop() num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 From 24a3d12b821a081850c1659f61762e799eeba902 Mon Sep 17 00:00:00 2001 From: dhuangnm <74931910+dhuangnm@users.noreply.github.com> Date: Fri, 13 Dec 2024 22:22:44 -0500 Subject: [PATCH 22/56] update compressed-tensors to latest version (#11183) Co-authored-by: dhuangnm --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index 11984260c580d..71c5b122d7c42 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -33,5 +33,5 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.8.0 # required for compressed-tensors +compressed-tensors == 0.8.1 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging torch.compile From 48259264a4012e756215adc87e3682bf1e7dfee9 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sat, 14 Dec 2024 02:46:18 -0500 Subject: [PATCH 23/56] [Core] Update outlines and increase its threadpool size (#11140) Signed-off-by: Russell Bryant --- requirements-common.txt | 2 +- .../guided_decoding/outlines_decoding.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 71c5b122d7c42..bd2b4b7a01668 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines == 0.1.9 +outlines == 0.1.11 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index 8a7ff38bfeb1a..eb8db882435e6 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -1,5 +1,6 @@ import asyncio import concurrent.futures +import os from enum import Enum from json import dumps as json_dumps from re import escape as regex_escape @@ -48,6 +49,11 @@ class GuidedDecodingMode(Enum): global_thread_pool = None # used for generating logits processor fsm +# It's not yet clear that using more provides a benefit, and it could +# potentially starve other processes on the machine. We'll cap this for now and +# adjust later if testing proves it to help overcome a bottleneck. 
+_MAX_THREADPOOL_WORKERS = 16 + async def get_outlines_guided_decoding_logits_processor( guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase @@ -65,8 +71,11 @@ async def get_outlines_guided_decoding_logits_processor( return None if global_thread_pool is None: + max_workers = os.cpu_count() or 2 + if max_workers > _MAX_THREADPOOL_WORKERS: + max_workers = _MAX_THREADPOOL_WORKERS global_thread_pool = concurrent.futures.ThreadPoolExecutor( - max_workers=2) + max_workers=max_workers) loop = asyncio.get_running_loop() return await loop.run_in_executor(global_thread_pool, From ea7bd68d101884165ffd75c1fd6e94a97510f194 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Sat, 14 Dec 2024 03:21:23 -0500 Subject: [PATCH 24/56] [V1][Bugfix] Fix V1 TP trust-remote-code (#11182) Signed-off-by: Tyler Michael Smith --- vllm/v1/engine/core.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index dc8c1d39eefa9..af644fb5fedba 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -14,6 +14,8 @@ from vllm.config import CacheConfig, VllmConfig from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger +from vllm.transformers_utils.config import ( + maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, @@ -241,6 +243,9 @@ def run_engine_core(*args, **kwargs): # processes to terminate without error shutdown_requested = False + # Ensure we can serialize transformer config after spawning + maybe_register_config_serialize_by_value() + def signal_handler(signum, frame): nonlocal shutdown_requested if not shutdown_requested: From 3cb5769883fa104e42248f2b3f41a310947f357c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 15 Dec 2024 00:38:27 +0800 Subject: [PATCH 25/56] [Misc] Minor improvements to the readability of PunicaWrapperBase (#11200) Signed-off-by: Jee Jee Li --- vllm/lora/punica_wrapper/punica_base.py | 14 +++++----- vllm/lora/punica_wrapper/punica_gpu.py | 34 ++++++++++++------------- vllm/lora/punica_wrapper/punica_hpu.py | 4 +-- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index 0a5a84bdd8deb..b9ec0c4bc6323 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -63,7 +63,7 @@ def add_expand( lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], output_slices: Tuple[int, ...], offset_start: int = 0, - add_input=True, + add_inputs=True, **kwargs, ) -> None: """ @@ -77,7 +77,7 @@ def add_lora_embedding( y: torch.Tensor, x: torch.Tensor, lora_b_stacked: torch.Tensor, - add_input: bool = True, + add_inputs: bool = True, **kwargs, ) -> None: """ @@ -367,12 +367,13 @@ def add_expand(self, lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], output_slices: Tuple[int, ...], offset_start: int = 0, - add_input=True, + add_inputs=True, **kwargs) -> None: """ Performs GEMM and bias addition for multiple slices of lora_b. Semantics: + offset = offset_start for i in range(len(lora_b_stacked)): slice = output_slices[i] y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + @@ -386,7 +387,8 @@ def add_expand(self, lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight output_slices (Tuple[int, ...]): Every slice's size - add_input (bool): Defaults to True. 
+ offset_start (int): The starting position of y, defaults to 0 + add_inputs (bool): Defaults to True. """ # TODO: implement it based on torch ops @@ -397,7 +399,7 @@ def add_lora_embedding(self, y: torch.Tensor, x: torch.Tensor, lora_b_stacked: torch.Tensor, - add_input: bool = True, + add_inputs: bool = True, **kwargs) -> None: """ Applies lora specifically for VocabParallelEmbeddingWithLoRA. @@ -409,7 +411,7 @@ def add_lora_embedding(self, y (torch.Tensor): Output tensor. x (torch.Tensor): Input tensor. lora_b_stacked (torch.Tensor): lora_b's weights. - add_input (bool): Default to True. + add_inputs (bool): Default to True. """ # TODO: implement it based on torch ops raise NotImplementedError diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index b2af29de129ce..de378df8b3cfa 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -67,7 +67,7 @@ def _expand_prefill( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, - add_input: bool, + add_inputs: bool, ): #No LoRA request, so return directly if self.no_lora: @@ -77,7 +77,7 @@ def _expand_prefill( w_t_all, y, *self.prefill_metadata, - add_input, + add_inputs, ) def _expand_decode( @@ -85,9 +85,9 @@ def _expand_decode( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, - add_input: bool, + add_inputs: bool, ): - bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) + bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs) def _expand_slice_prefill( self, @@ -96,7 +96,7 @@ def _expand_slice_prefill( w_t_all: torch.Tensor, y_offset: Optional[int], y_slice_size: Optional[int], - add_input: bool, + add_inputs: bool, ): #No LoRA request, so return directly if self.no_lora: @@ -108,7 +108,7 @@ def _expand_slice_prefill( *self.prefill_metadata, y_offset, y_slice_size, - add_input, + add_inputs, ) def _expand_slice_decode( @@ -118,10 +118,10 @@ def _expand_slice_decode( w_t_all: torch.Tensor, y_offset: Optional[int], y_slice_size: Optional[int], - add_input: bool, + add_inputs: bool, ): bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, - y_slice_size, add_input) + y_slice_size, add_inputs) def _apply_expand( self, @@ -130,7 +130,7 @@ def _apply_expand( w_t_all: torch.Tensor, y_offset: Optional[int], y_slice_size: Optional[int], - add_input: bool = True, + add_inputs: bool = True, ): """ Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` @@ -141,7 +141,7 @@ def _apply_expand( expand_slice_fun: Callable = (self._expand_slice_prefill if self.is_prefill else self._expand_slice_decode) - expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs) def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, scale: float): @@ -194,7 +194,7 @@ def add_expand(self, lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], output_slices: Tuple[int, ...], offset_start: int = 0, - add_input=True, + add_inputs=True, **kwargs) -> None: """ Performs GEMM and bias addition for multiple slices of lora_b. @@ -213,7 +213,7 @@ def add_expand(self, lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight output_slices (Tuple[int, ...]): Every slice's size - add_input (bool): Defaults to True. + add_inputs (bool): Defaults to True. 
""" y_org = y y = y.view(-1, y.shape[-1]) @@ -228,7 +228,7 @@ def add_expand(self, lora_b_stacked[slice_idx], offset_left, output_slices[slice_idx], - add_input=add_input, + add_inputs=add_inputs, ) offset_left += output_slices[slice_idx] y = y.view_as(y_org) @@ -237,7 +237,7 @@ def add_lora_embedding(self, y: torch.Tensor, x: torch.Tensor, lora_b_stacked: torch.Tensor, - add_input: bool = True, + add_inputs: bool = True, **kwargs) -> None: """ Applies lora specifically for VocabParallelEmbeddingWithLoRA. @@ -249,13 +249,13 @@ def add_lora_embedding(self, y (torch.Tensor): Output tensor. x (torch.Tensor): Input tensor. lora_b_stacked (torch.Tensor): lora_b's weights. - add_input (bool): Default to True. + add_inputs (bool): Default to True. """ # Embedding layer only need expand op expand_fun: Callable = (self._expand_prefill if self.is_prefill else self._expand_decode) - expand_fun(y, x, lora_b_stacked, add_input) + expand_fun(y, x, lora_b_stacked, add_inputs) def add_lora_linear(self, y: torch.Tensor, @@ -311,7 +311,7 @@ def add_lora_linear(self, lora_b_stacked, None, output_slices, - add_input=True, + add_inputs=True, **kwargs) def add_lora_logits(self, diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py index 996325b712996..d9c4f44a1c282 100644 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ b/vllm/lora/punica_wrapper/punica_hpu.py @@ -21,7 +21,7 @@ def add_lora_embedding(self, y: torch.Tensor, x: torch.Tensor, lora_b_stacked: torch.Tensor, - add_input: bool = True, + add_inputs: bool = True, **kwargs) -> None: dispatch_bgmv_embedding(y, x, lora_b_stacked, 0) @@ -81,7 +81,7 @@ def add_expand( lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], output_slices: Tuple[int, ...], offset_start: int = 0, - add_input=True, + add_inputs=True, **kwargs, ) -> None: raise NotImplementedError From 9c3dadd1c97df2b37388c6898a0725457391f647 Mon Sep 17 00:00:00 2001 From: Brad Hilton Date: Sat, 14 Dec 2024 09:46:42 -0700 Subject: [PATCH 26/56] [Frontend] Add `logits_processors` as an extra completion argument (#11150) Signed-off-by: Brad Hilton --- tests/entrypoints/openai/test_serving_chat.py | 1 + vllm/config.py | 71 +++++++++-------- vllm/engine/arg_utils.py | 11 ++- vllm/entrypoints/openai/protocol.py | 77 ++++++++++++++++++- vllm/entrypoints/openai/serving_chat.py | 3 +- vllm/entrypoints/openai/serving_completion.py | 3 +- 6 files changed, 127 insertions(+), 39 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 93660e6118ca8..5b40a04db15ee 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -30,6 +30,7 @@ class MockModelConfig: tokenizer_revision = None multimodal_config = MultiModalConfig() hf_config = MockHFConfig() + logits_processor_pattern = None @dataclass diff --git a/vllm/config.py b/vllm/config.py index 12ed80c366e43..37d062f7eb079 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -156,41 +156,45 @@ class ModelConfig: can not be gathered from the vllm arguments. override_pooler_config: Initialize non default pooling config or override default pooling config for the pooling model. + logits_processor_pattern: Optional regex pattern specifying valid + logits processor qualified names that can be passed with the + `logits_processors` extra completion argument. Defaults to None, + which allows no processors. 
""" - def __init__( - self, - model: str, - task: Union[TaskOption, Literal["draft"]], - tokenizer: str, - tokenizer_mode: str, - trust_remote_code: bool, - dtype: Union[str, torch.dtype], - seed: int, - allowed_local_media_path: str = "", - revision: Optional[str] = None, - code_revision: Optional[str] = None, - rope_scaling: Optional[Dict[str, Any]] = None, - rope_theta: Optional[float] = None, - tokenizer_revision: Optional[str] = None, - max_model_len: Optional[int] = None, - spec_target_max_model_len: Optional[int] = None, - quantization: Optional[str] = None, - quantization_param_path: Optional[str] = None, - enforce_eager: Optional[bool] = None, - max_seq_len_to_capture: Optional[int] = None, - max_logprobs: int = 20, - disable_sliding_window: bool = False, - skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, List[str]]] = None, - limit_mm_per_prompt: Optional[Mapping[str, int]] = None, - use_async_output_proc: bool = True, - config_format: ConfigFormat = ConfigFormat.AUTO, - hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, - mm_cache_preprocessor: bool = False, - override_neuron_config: Optional[Dict[str, Any]] = None, - override_pooler_config: Optional["PoolerConfig"] = None) -> None: + def __init__(self, + model: str, + task: Union[TaskOption, Literal["draft"]], + tokenizer: str, + tokenizer_mode: str, + trust_remote_code: bool, + dtype: Union[str, torch.dtype], + seed: int, + allowed_local_media_path: str = "", + revision: Optional[str] = None, + code_revision: Optional[str] = None, + rope_scaling: Optional[Dict[str, Any]] = None, + rope_theta: Optional[float] = None, + tokenizer_revision: Optional[str] = None, + max_model_len: Optional[int] = None, + spec_target_max_model_len: Optional[int] = None, + quantization: Optional[str] = None, + quantization_param_path: Optional[str] = None, + enforce_eager: Optional[bool] = None, + max_seq_len_to_capture: Optional[int] = None, + max_logprobs: int = 20, + disable_sliding_window: bool = False, + skip_tokenizer_init: bool = False, + served_model_name: Optional[Union[str, List[str]]] = None, + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, + use_async_output_proc: bool = True, + config_format: ConfigFormat = ConfigFormat.AUTO, + hf_overrides: Optional[HfOverrides] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, + mm_cache_preprocessor: bool = False, + override_neuron_config: Optional[Dict[str, Any]] = None, + override_pooler_config: Optional["PoolerConfig"] = None, + logits_processor_pattern: Optional[str] = None) -> None: self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode @@ -316,6 +320,7 @@ def __init__( self.task: Final = task self.pooler_config = self._init_pooler_config(override_pooler_config) + self.logits_processor_pattern = logits_processor_pattern self._verify_quantization() self._verify_cuda_graph() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0098648b1cd60..5a73c6ee02e0c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -170,6 +170,7 @@ class EngineArgs: enable_chunked_prefill: Optional[bool] = None guided_decoding_backend: str = 'xgrammar' + logits_processor_pattern: Optional[str] = None # Speculative decoding configuration. 
speculative_model: Optional[str] = None speculative_model_quantization: Optional[str] = None @@ -374,6 +375,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'https://github.com/noamgat/lm-format-enforcer.' ' Can be overridden per request via guided_decoding_backend' ' parameter.') + parser.add_argument( + '--logits-processor-pattern', + type=nullable_str, + default=None, + help='Optional regex pattern specifying valid logits processor ' + 'qualified names that can be passed with the `logits_processors` ' + 'extra completion argument. Defaults to None, which allows no ' + 'processors.') # Parallel arguments parser.add_argument( '--distributed-executor-backend', @@ -975,7 +984,7 @@ def create_model_config(self) -> ModelConfig: mm_cache_preprocessor=self.mm_cache_preprocessor, override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, - ) + logits_processor_pattern=self.logits_processor_pattern) def create_load_config(self) -> LoadConfig: return LoadConfig( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f4e7740ea0cff..dfb7c977dbd43 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1,5 +1,6 @@ # Adapted from # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py +import re import time from argparse import Namespace from typing import Any, Dict, List, Literal, Optional, Union @@ -14,7 +15,7 @@ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, RequestOutputKind, SamplingParams) from vllm.sequence import Logprob -from vllm.utils import random_uuid +from vllm.utils import random_uuid, resolve_obj_by_qualname logger = init_logger(__name__) @@ -148,6 +149,46 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel): type: Literal["function"] = "function" +class LogitsProcessorConstructor(BaseModel): + qualname: str + args: Optional[List[Any]] = None + kwargs: Optional[Dict[str, Any]] = None + + +LogitsProcessors = List[Union[str, LogitsProcessorConstructor]] + + +def get_logits_processors(processors: Optional[LogitsProcessors], + pattern: Optional[str]) -> Optional[List[Any]]: + if processors and pattern: + logits_processors = [] + for processor in processors: + qualname = processor if isinstance(processor, + str) else processor.qualname + if not re.match(pattern, qualname): + raise ValueError( + f"Logits processor '{qualname}' is not allowed by this " + "server. See --logits-processor-pattern engine argument " + "for more information.") + try: + logits_processor = resolve_obj_by_qualname(qualname) + except Exception as e: + raise ValueError( + f"Logits processor '{qualname}' could not be resolved: {e}" + ) from e + if isinstance(processor, LogitsProcessorConstructor): + logits_processor = logits_processor(*processor.args or [], + **processor.kwargs or {}) + logits_processors.append(logits_processor) + return logits_processors + elif processors: + raise ValueError( + "The `logits_processors` argument is not supported by this " + "server. See --logits-processor-pattern engine argugment " + "for more information.") + return None + + class ChatCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/chat/create @@ -293,6 +334,17 @@ class ChatCompletionRequest(OpenAIBaseModel): "The request_id related to this request. 
If the caller does " "not set it, a random_uuid will be generated. This id is used " "through out the inference process and return in response.")) + logits_processors: Optional[LogitsProcessors] = Field( + default=None, + description=( + "A list of either qualified names of logits processors, or " + "constructor objects, to apply when sampling. A constructor is " + "a JSON object with a required 'qualname' field specifying the " + "qualified name of the processor class/factory, and optional " + "'args' and 'kwargs' fields containing positional and keyword " + "arguments. For example: {'qualname': " + "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " + "{'param': 'value'}}.")) # doc: end-chat-completion-extra-params @@ -314,7 +366,9 @@ def to_beam_search_params(self, length_penalty=self.length_penalty, include_stop_str_in_output=self.include_stop_str_in_output) - def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: + def to_sampling_params( + self, default_max_tokens: int, + logits_processor_pattern: Optional[str]) -> SamplingParams: # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens if max_tokens is None: @@ -364,6 +418,8 @@ def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: min_tokens=self.min_tokens, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, + logits_processors=get_logits_processors(self.logits_processors, + logits_processor_pattern), include_stop_str_in_output=self.include_stop_str_in_output, truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ @@ -599,6 +655,17 @@ class CompletionRequest(OpenAIBaseModel): "The priority of the request (lower means earlier handling; " "default: 0). Any priority other than 0 will raise an error " "if the served model does not use priority scheduling.")) + logits_processors: Optional[LogitsProcessors] = Field( + default=None, + description=( + "A list of either qualified names of logits processors, or " + "constructor objects, to apply when sampling. A constructor is " + "a JSON object with a required 'qualname' field specifying the " + "qualified name of the processor class/factory, and optional " + "'args' and 'kwargs' fields containing positional and keyword " + "arguments. 
For example: {'qualname': " + "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " + "{'param': 'value'}}.")) # doc: end-completion-extra-params @@ -619,7 +686,9 @@ def to_beam_search_params(self, length_penalty=self.length_penalty, include_stop_str_in_output=self.include_stop_str_in_output) - def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: + def to_sampling_params( + self, default_max_tokens: int, + logits_processor_pattern: Optional[str]) -> SamplingParams: max_tokens = self.max_tokens if max_tokens is None: max_tokens = default_max_tokens @@ -665,6 +734,8 @@ def to_sampling_params(self, default_max_tokens: int) -> SamplingParams: skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, + logits_processors=get_logits_processors(self.logits_processors, + logits_processor_pattern), truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a5e7b4ac3bb30..527418c635093 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -197,7 +197,8 @@ async def create_chat_completion( default_max_tokens) else: sampling_params = request.to_sampling_params( - default_max_tokens) + default_max_tokens, + self.model_config.logits_processor_pattern) self._log_inputs(request_id, request_prompts[i], diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index b3436773062f3..bd39a4c42e938 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -123,7 +123,8 @@ async def create_completion( default_max_tokens) else: sampling_params = request.to_sampling_params( - default_max_tokens) + default_max_tokens, + self.model_config.logits_processor_pattern) request_id_item = f"{request_id}-{i}" From 93abf23a648051fe6dc053ba0b74499d119920bf Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 15 Dec 2024 01:52:18 +0800 Subject: [PATCH 27/56] [VLM] Fully dynamic prompt replacement in merged input processor (#11199) Signed-off-by: DarkLight1337 --- examples/offline_inference_vision_language.py | 5 +- .../mm_processor_kwargs/test_phi3v.py | 4 +- tests/multimodal/test_processing.py | 105 +-- .../vllm_add_dummy_model/my_llava.py | 4 +- vllm/inputs/registry.py | 71 +- vllm/model_executor/models/llava.py | 144 ++--- vllm/model_executor/models/phi3v.py | 118 ++-- vllm/model_executor/models/pixtral.py | 2 +- vllm/multimodal/base.py | 4 +- vllm/multimodal/processing.py | 606 +++++++++--------- vllm/multimodal/registry.py | 4 +- vllm/utils.py | 12 +- 12 files changed, 569 insertions(+), 510 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index c430f42fdc814..45539c665a922 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -97,9 +97,6 @@ def run_phi3v(question: str, modality: str): # max_model_len (128k) for this model may cause OOM. # You may lower either to run this example on lower-end GPUs. - # In this example, we override max_num_seqs to 5 while - # keeping the original context length of 128k. 
- # num_crops is an override kwarg to the multimodal image processor; # For some models, e.g., Phi-3.5-vision-instruct, it is recommended # to use 16 for single frame scenarios, and 4 for multi-frame. @@ -113,7 +110,7 @@ def run_phi3v(question: str, modality: str): # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 llm = LLM( - model="microsoft/Phi-3-vision-128k-instruct", + model="microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, max_model_len=4096, max_num_seqs=2, diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py index c16192a1e1438..ce8ac8d8e0ceb 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py @@ -16,8 +16,8 @@ # Wrap lazy imports to avoid initializing CUDA during test collection @pytest.fixture() def processor_for_phi3v(): - from vllm.model_executor.models.phi3v import Phi3VProcessor - return Phi3VProcessor + from vllm.model_executor.models.phi3v import Phi3VMultiModalProcessor + return Phi3VMultiModalProcessor @pytest.fixture() diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index ae668d1dd56c8..6aaa80ddc9fa5 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,11 +1,11 @@ from typing import cast import pytest -from transformers import BatchFeature -from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo, - find_text_matches, find_token_matches, - iter_placeholders, iter_token_matches, +from vllm.multimodal.processing import (MultiModalDataItems, PromptReplacement, + _PlaceholderInfo, find_text_matches, + find_token_matches, iter_placeholders, + iter_token_matches, replace_text_matches, replace_token_matches) from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -16,7 +16,7 @@ @pytest.mark.parametrize( ("token_ids", "match_ids", "expected"), [ - ([], [], [{ "start_idx": 0, "end_idx": 0 }]), + ([], [], []), ([], [32000], []), ( [32000, 32000, 32000], @@ -83,7 +83,7 @@ def test_iter_token_matches(token_ids, match_ids, expected): "pattern_2": [32000], }, { - "pattern_1": [{ "start_idx": 0, "end_idx": 0 }], + "pattern_1": [], "pattern_2": [], } ), @@ -136,7 +136,7 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key): mock_tokenizer = cast(AnyTokenizer, object()) prompt_repls = [ - PromptReplacement(target, [], 0).bind(key, mock_tokenizer) + PromptReplacement(key, target, []).bind(mock_tokenizer) for key, target in target_by_key.items() ] result = find_token_matches(prompt, prompt_repls) @@ -243,7 +243,7 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key): mock_tokenizer = cast(AnyTokenizer, object()) prompt_repls = [ - PromptReplacement(target, [], 0).bind(key, mock_tokenizer) + PromptReplacement(key, target, []).bind(mock_tokenizer) for key, target in target_by_key.items() ] result = find_text_matches(prompt, prompt_repls) @@ -276,12 +276,12 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key): "pattern_3": "!", }, { - # Test whether target is confused with repl_unit - "pattern_1": ("", 1), - # Test empty repl_unit - "pattern_2": ("", 1), - # Test multiple repl_count - "pattern_3": ("?", 2), + # Test whether target is confused with replacement + 
"pattern_1": "", + # Test empty replacement + "pattern_2": "", + # Test dynamic replacement (beyond the form of `unit * count`) + "pattern_3": "?!?", }, ), ] @@ -290,8 +290,8 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key): ("mm_count", "expected"), [ (0, "Image:Image:!"), - (1, "Image:??"), - (2, "??"), + (1, "Image:?!?"), + (2, "?!?"), ] ) # yapf: enable @@ -306,7 +306,7 @@ def test_find_replace_text( mock_tokenizer = cast(AnyTokenizer, object()) prompt_repls = [ - PromptReplacement(target, *repl_by_key[key]).bind(key, mock_tokenizer) + PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) for key, target in target_by_key.items() ] matches = find_text_matches(prompt, prompt_repls) @@ -314,9 +314,8 @@ def test_find_replace_text( result = replace_text_matches( prompt, matches, - {key: list(range(mm_count)) - for key in repl_by_key}, - BatchFeature(), + MultiModalDataItems({key: [None] * mm_count + for key in repl_by_key}), ) # Only displayed on error @@ -343,12 +342,12 @@ def test_find_replace_text( "pattern_3": [918], }, { - # Test whether target is confused with repl_unit - "pattern_1": ([32000, 32000], 1), - # Test empty repl_unit - "pattern_2": ([], 1), - # Test multiple repl_count - "pattern_3": ([1550], 2), + # Test whether target is confused with replacement + "pattern_1": [32000, 32000], + # Test empty replacement + "pattern_2": [], + # Test dynamic replacement (beyond the form of `unit * count`) + "pattern_3": [1550, 918, 1550], }, ), ] @@ -357,8 +356,8 @@ def test_find_replace_text( ("mm_count", "expected"), [ (0, [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918]), - (1, [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 1550]), - (2, [1, 32000, 32000, 32000, 32000, 32000, 1550, 1550]), + (1, [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550]), + (2, [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550]), ] ) # yapf: enable @@ -373,7 +372,7 @@ def test_find_replace_tokens( mock_tokenizer = cast(AnyTokenizer, object()) prompt_repls = [ - PromptReplacement(target, *repl_by_key[key]).bind(key, mock_tokenizer) + PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) for key, target in target_by_key.items() ] matches = find_token_matches(prompt, prompt_repls) @@ -381,9 +380,8 @@ def test_find_replace_tokens( result = replace_token_matches( prompt, matches, - {key: list(range(mm_count)) - for key in repl_by_key}, - BatchFeature(), + MultiModalDataItems({key: [None] * mm_count + for key in repl_by_key}), ) # Only displayed on error @@ -399,9 +397,9 @@ def test_find_replace_tokens( "repl_by_key", [ { - "pattern_1": ([32000, 32000], 1), - "pattern_2": ([], 1), - "pattern_3": ([1550], 2), + "pattern_1": [32000, 32000], + "pattern_2": [], + "pattern_3": [1550, 918, 1550], }, ], ) @@ -414,48 +412,47 @@ def test_find_replace_tokens( _PlaceholderInfo( modality="pattern_1", start_idx=6, - unit=[32000, 32000], - unit_count=1, + replacement=[32000, 32000], ), ], ), ( - [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 1550], + [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], [ _PlaceholderInfo( modality="pattern_1", start_idx=1, - unit=[32000, 32000], - unit_count=1, + replacement=[32000, 32000], ), _PlaceholderInfo( modality="pattern_1", start_idx=5, - unit=[32000, 32000], - unit_count=1, + replacement=[32000, 32000], ), _PlaceholderInfo( modality="pattern_3", start_idx=7, - unit=[1550], - unit_count=2, + replacement=[1550, 918, 1550], ), ], ), ( - [1, 32000, 32000, 32000, 32000, 32000, 1550, 1550], + [1, 
32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], [ _PlaceholderInfo( modality="pattern_1", start_idx=1, - unit=[32000, 32000], - unit_count=2, + replacement=[32000, 32000], + ), + _PlaceholderInfo( + modality="pattern_1", + start_idx=3, + replacement=[32000, 32000], ), _PlaceholderInfo( modality="pattern_3", start_idx=6, - unit=[1550], - unit_count=2, + replacement=[1550, 918, 1550], ), ], ), @@ -470,11 +467,17 @@ def test_iter_placeholders( mock_tokenizer = cast(AnyTokenizer, object()) prompt_repls = [ - PromptReplacement([], *repl).bind(key, mock_tokenizer) + PromptReplacement(key, [], repl).bind(mock_tokenizer) for key, repl in repl_by_key.items() ] - result = list(iter_placeholders(prompt_repls, prompt)) + result = list( + iter_placeholders( + prompt_repls, + prompt, + # Effectively match all occurrences in the prompt + MultiModalDataItems({key: [None] * 3 for key in repl_by_key}), + )) # Only displayed on error print("result:", result) diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index 2f4194a63fc25..0d90635093ac7 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -3,14 +3,14 @@ import torch from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, - LlavaProcessor, + LlavaMultiModalProcessor, get_max_llava_image_tokens) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) -@MULTIMODAL_REGISTRY.register_processor(LlavaProcessor) +@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) class MyLlava(LlavaForConditionalGeneration): def compute_logits( diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 0dfed3b7e61bf..0b85484c48714 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -2,7 +2,7 @@ from collections import UserDict from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, NamedTuple, - Optional, Protocol, Type, cast) + Optional, Protocol, Type) from torch import nn from transformers import PretrainedConfig, ProcessorMixin @@ -47,7 +47,6 @@ def get_hf_config(self, hf_config_type: Type[C] = PretrainedConfig) -> C: Raises: TypeError: If the model is not of the specified type. """ - hf_config = self.model_config.hf_config if not isinstance(hf_config, hf_config_type): raise TypeError("Invalid type of HuggingFace config. " @@ -60,21 +59,70 @@ def get_hf_image_processor_config(self) -> Dict[str, Any]: """ Get the HuggingFace image processor configuration of the model. """ - return self.model_config.hf_image_processor_config + def get_mm_config(self): + """ + Get the multimodal config of the model. + + Raises: + RuntimeError: If the model is not a multimodal model. 
+ """ + mm_config = self.model_config.multimodal_config + if mm_config is None: + raise RuntimeError("Not a multimodal model") + + return mm_config + + def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + base_kwargs = self.model_config.mm_processor_kwargs + if base_kwargs is None: + base_kwargs = {} + + merged_kwargs = {**base_kwargs, **kwargs} + + return cached_get_processor( + self.model_config.model, + trust_remote_code=self.model_config.trust_remote_code, + **merged_kwargs, + ) + @dataclass(frozen=True) class InputProcessingContext(InputContext): tokenizer: AnyTokenizer """The tokenizer used to tokenize the inputs.""" - def get_hf_processor(self, **kwargs) -> ProcessorMixin: + def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + base_kwargs = self.model_config.mm_processor_kwargs + if base_kwargs is None: + base_kwargs = {} + + merged_kwargs = {**base_kwargs, **kwargs} + return cached_get_processor( - self.model_config.tokenizer, + self.model_config.model, tokenizer=self.tokenizer, # Override the tokenizer with ours trust_remote_code=self.model_config.trust_remote_code, - **kwargs) + **merged_kwargs, + ) + + def resolve_hf_processor_call_kwargs( + self, + hf_processor: ProcessorMixin, + inference_kwargs: Mapping[str, object], + ) -> Mapping[str, object]: + assert callable(hf_processor) + + base_kwargs = self.model_config.mm_processor_kwargs + if base_kwargs is None: + base_kwargs = {} + + return resolve_mm_processor_kwargs( + base_kwargs, + inference_kwargs, + hf_processor, + ) N = TypeVar("N", bound=Type[nn.Module]) @@ -171,7 +219,8 @@ def register_dummy_data(self, factory: DummyDataFactory): """ def wrapper(model_cls: N) -> N: - if model_cls in self._dummy_factories_by_model_type: + if self._dummy_factories_by_model_type.contains(model_cls, + strict=True): logger.warning( "Model class %s already has dummy data " "registered to %s. It is overwritten by the new one.", @@ -195,7 +244,8 @@ def register_dummy_encoder_data(self, factory: DummyDataFactory): """ def wrapper(model_cls: N) -> N: - if model_cls in self._dummy_encoder_factories_by_model_type: + if self._dummy_encoder_factories_by_model_type.contains( + model_cls, strict=True): logger.warning( "Model class %s already has dummy encoder data " "registered to %s. It is overwritten by the new one.", @@ -305,7 +355,8 @@ def register_input_processor(self, processor: InputProcessor): """ def wrapper(model_cls: N) -> N: - if model_cls in self._input_processors_by_model_type: + if self._input_processors_by_model_type.contains(model_cls, + strict=True): logger.warning( "Model class %s already has input processor " "registered to %s. 
It is overwritten by the new one.", @@ -357,7 +408,7 @@ def process_input(self, model_config: "ModelConfig", # If it's empty, it'll fall back to the default kwarg values mm_processor_kwargs = resolve_mm_processor_kwargs( model_config.mm_processor_kwargs, - cast(Dict[str, Any], inputs.get("mm_processor_kwargs")), + inputs.get("mm_processor_kwargs", {}), # type: ignore processor, ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 53eef72dd5f91..a2e404cf43238 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -5,10 +5,10 @@ import torch import torch.nn as nn -from PIL.Image import Image from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, ProcessorMixin, SiglipVisionConfig) +from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor from vllm.attention import AttentionMetadata @@ -21,11 +21,9 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.processing import (BaseMultiModalProcessor, - InputProcessingContext, - ModalityProcessingMetadata, - MultiModalProcessingMetadata, + MultiModalDataItems, ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors @@ -33,7 +31,8 @@ get_max_clip_image_tokens) from .interfaces import SupportsMultiModal, SupportsPP from .pixtral import (PixtralHFVisionModel, dummy_image_for_pixtral_hf, - get_max_pixtral_hf_image_tokens) + get_max_pixtral_hf_image_tokens, + get_pixtral_hf_image_feature_size) from .siglip import (SiglipVisionModel, dummy_image_for_siglip, get_max_siglip_image_tokens) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, @@ -115,62 +114,7 @@ def get_max_llava_image_tokens(ctx: InputContext): raise ValueError(f"Unexpected select feature strategy: {strategy}") -def dummy_mm_kwargs_for_llava(ctx: InputProcessingContext, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(LlavaConfig) - vision_config = hf_config.vision_config - num_images = mm_counts["image"] - - if isinstance(vision_config, CLIPVisionConfig): - data = dummy_image_for_clip(vision_config, num_images) - elif isinstance(vision_config, SiglipVisionConfig): - data = dummy_image_for_siglip(vision_config, num_images) - elif isinstance(vision_config, PixtralVisionConfig): - data = dummy_image_for_pixtral_hf(vision_config, num_images) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - hf_processor = ctx.get_hf_processor() - image_processor = hf_processor.image_processor # type: ignore - hf_inputs = image_processor.preprocess(data['image'], return_tensors="pt") - is_pixtral = isinstance(hf_processor, PixtralProcessor) - - return MultiModalKwargs( - **hf_inputs, - is_pixtral=torch.tensor(is_pixtral), - ) - - -def create_metadata_for_llava( - ctx: InputProcessingContext) -> MultiModalProcessingMetadata: - hf_config = ctx.get_hf_config(LlavaConfig) - image_token_id = hf_config.image_token_index - - def get_repl_count( - mm_items: list[Image], - hf_inputs: BatchFeature, - item_idx: int, - ) -> int: - return get_max_llava_image_tokens(ctx) - - return { - "image": - ModalityProcessingMetadata(prompt_repls=[ - 
PromptReplacement(target=[image_token_id], - repl_unit=[image_token_id], - repl_count=get_repl_count), - ]), - } - - -class LlavaProcessor(BaseMultiModalProcessor): - - def __init__(self, ctx: InputProcessingContext) -> None: - super().__init__( - ctx=ctx, - metadata=create_metadata_for_llava(ctx), - ) +class LlavaMultiModalProcessor(BaseMultiModalProcessor): def _patch_pixtral_processor(self, hf_processor: PixtralProcessor): if getattr(hf_processor, "__is_patched__", False): @@ -188,18 +132,72 @@ def preprocess(__self, *args, **kwargs): hf_processor.__is_patched__ = True # type: ignore - def _get_hf_processor(self) -> ProcessorMixin: + def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]: hf_processor = self.ctx.get_hf_processor() + assert isinstance(hf_processor, (LlavaProcessor, PixtralProcessor)) if isinstance(hf_processor, PixtralProcessor): self._patch_pixtral_processor(hf_processor) return hf_processor - def _get_dummy_mm_kwargs( + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_inputs: BatchFeature, + mm_processor_kwargs: Mapping[str, object], + ) -> list[PromptReplacement]: + hf_config = self.ctx.get_hf_config(LlavaConfig) + image_token_id = hf_config.image_token_index + + processor = self._get_hf_processor() + if isinstance(processor, PixtralProcessor): + image_token = processor.image_token + image_break_token = processor.image_break_token + image_end_token = processor.image_end_token + + vision_config = hf_config.vision_config + assert isinstance(vision_config, PixtralVisionConfig) + + def get_replacement_pixtral(item_idx: int): + image_size = mm_items.get_image_size(item_idx) + ( + num_width_tokens, + num_height_tokens, + ) = get_pixtral_hf_image_feature_size( + vision_config, + image_width=image_size.width, + image_height=image_size.height, + ) + + tokens = ([image_token] * num_width_tokens + + [image_break_token]) * num_height_tokens + tokens[-1] = image_end_token + + return "".join(tokens) + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement_pixtral, + ), + ] + + max_image_tokens = get_max_llava_image_tokens(self.ctx) + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=[image_token_id] * max_image_tokens, + ) + ] + + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], - ) -> MultiModalKwargs: + ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config(LlavaConfig) vision_config = hf_config.vision_config num_images = mm_counts["image"] @@ -215,11 +213,13 @@ def _get_dummy_mm_kwargs( raise NotImplementedError(msg) hf_processor = self._get_hf_processor() - image_processor = hf_processor.image_processor # type: ignore - hf_inputs = image_processor.preprocess(data['image'], - return_tensors="pt") + image_token = hf_processor.image_token - return MultiModalKwargs(**hf_inputs) + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=data, + mm_processor_kwargs={}, + ) class LlavaLikeConfig(Protocol): @@ -303,7 +303,7 @@ def init_vision_tower_for_llava( @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) -@MULTIMODAL_REGISTRY.register_processor(LlavaProcessor) +@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = { @@ -584,7 +584,7 @@ def load_weights(self, weights: Iterable[Tuple[str, return loader.load_weights(weights) 
-class MantisProcessor(LlavaProcessor): +class MantisMultiModalProcessor(LlavaMultiModalProcessor): def _get_hf_processor(self) -> ProcessorMixin: try: @@ -604,6 +604,6 @@ def _get_hf_processor(self) -> ProcessorMixin: # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) -@MULTIMODAL_REGISTRY.register_processor(MantisProcessor) +@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 3c7854ce388ab..7ab06768ae612 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -32,13 +32,10 @@ from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.processing import (BaseMultiModalProcessor, - InputProcessingContext, - ModalityProcessingMetadata, MultiModalDataDict, - MultiModalProcessingMetadata, + MultiModalDataItems, ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -305,64 +302,17 @@ def add_image_newline(self, image_features_hd): return image_features_hd_newline -def get_max_phi3v_image_tokens(ctx: InputContext, - *, - num_crops: Optional[int] = None): - mm_processor_kwargs = {} - if num_crops is not None: - mm_processor_kwargs["num_crops"] = num_crops +def get_max_phi3v_image_tokens(ctx: InputContext) -> int: + processor = ctx.get_hf_processor() + image_processor = processor.image_processor # type: ignore - model_config = ctx.model_config - image_processor = cached_get_image_processor( - model_config.model, - trust_remote_code=model_config.trust_remote_code, - **mm_processor_kwargs, - ) - - num_tokens = image_processor.calc_num_image_tokens_from_image_size( + return image_processor.calc_num_image_tokens_from_image_size( width=MAX_IMAGE_FEATURE_SIZE_WIDTH, height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, ) - return num_tokens - - -def dummy_mm_kwargs_for_phi3v(ctx: InputProcessingContext, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - - data = dummy_image_for_clip( - CLIP_VIT_LARGE_PATCH14_336_CONFIG, - num_images, - image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - ) - - hf_processor = ctx.get_hf_processor() - image_processor = hf_processor.image_processor # type: ignore - hf_inputs = image_processor.preprocess(data['image'], return_tensors="pt") - - return MultiModalKwargs(**hf_inputs) - - -def create_metadata_for_phi3v( - ctx: InputProcessingContext) -> MultiModalProcessingMetadata: - return { - "image": - ModalityProcessingMetadata(prompt_repls=[ - PromptReplacement(target=[_IMAGE_TOKEN_ID], - repl_unit=[_IMAGE_TOKEN_ID], - repl_count=get_max_phi3v_image_tokens(ctx)), - ]), - } - -class Phi3VProcessor(BaseMultiModalProcessor): - def __init__(self, ctx: InputProcessingContext) -> None: - super().__init__( - ctx=ctx, - metadata=create_metadata_for_phi3v(ctx), - ) +class Phi3VMultiModalProcessor(BaseMultiModalProcessor): def _get_hf_processor( self, @@ -389,15 +339,61 @@ def _apply_hf_processor( 
processed_outputs['input_ids'] = token_ids return processed_outputs - def _get_dummy_mm_kwargs( + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_inputs: BatchFeature, + mm_processor_kwargs: Mapping[str, object], + ) -> list[PromptReplacement]: + hf_processor = self._get_hf_processor() + image_tokens: list[str] = hf_processor.img_tokens # type: ignore + image_processor = hf_processor.image_processor # type: ignore + + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + + def get_replacement_phi3v(item_idx: int): + image_size = mm_items.get_image_size(item_idx) + num_tokens = image_processor.calc_num_image_tokens_from_image_size( + width=image_size.width, + height=image_size.height, + ) + + return [_IMAGE_TOKEN_ID] * num_tokens + + return [ + PromptReplacement( + modality="image", + target=image_token, + replacement=get_replacement_phi3v, + ) for image_token in image_tokens[:max_images] + ] + + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], - ) -> MultiModalKwargs: - return dummy_mm_kwargs_for_phi3v(self.ctx, mm_counts) + ) -> ProcessorInputs: + num_images = mm_counts["image"] + + data = dummy_image_for_clip( + CLIP_VIT_LARGE_PATCH14_336_CONFIG, + num_images, + image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, + image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + ) + + hf_processor = self._get_hf_processor() + image_tokens: list[str] = hf_processor.img_tokens # type: ignore + + return ProcessorInputs( + prompt_text="".join(image_tokens[:num_images]), + mm_data=data, + mm_processor_kwargs={}, + ) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) -@MULTIMODAL_REGISTRY.register_processor(Phi3VProcessor) +@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 94a4ab882c1a9..161d6b41bfa5f 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -72,7 +72,7 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder image_token_id = mm_encoder.special_ids.img - mm_config = ctx.model_config.multimodal_config + mm_config = ctx.get_mm_config() num_images = mm_config.limit_per_prompt.get("image", 1) # dummy size diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7dba94b885b6d..fe77a4635f7d8 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -99,7 +99,7 @@ def register_input_mapper( """ def wrapper(model_cls: N) -> N: - if model_cls in self._input_mappers: + if self._input_mappers.contains(model_cls, strict=True): logger.warning( "Model class %s already has an input mapper " "registered to %s. It is overwritten by the new one.", @@ -194,7 +194,7 @@ def register_max_multimodal_tokens( """ def wrapper(model_cls: N) -> N: - if model_cls in self._max_mm_tokens: + if self._max_mm_tokens.contains(model_cls, strict=True): logger.warning( "Model class %s already calculates maximum number of " "tokens in %s. 
It is overwritten by the new one.", diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 922c83b6fd8a9..de5a002d474c2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,116 +1,59 @@ import re from abc import ABC, abstractmethod +from collections import UserDict from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence -from dataclasses import dataclass +from dataclasses import dataclass, field from functools import lru_cache -from typing import (Any, Dict, Generic, NamedTuple, Optional, Protocol, - TypeVar, Union, cast) +from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union +import numpy as np import torch +from PIL.Image import Image from transformers import BatchFeature, ProcessorMixin -from typing_extensions import TypeAlias, TypedDict +from typing_extensions import assert_never from vllm.inputs import DummyData, InputProcessingContext +from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import (flatten_2d_lists, full_groupby, is_list_of, - resolve_mm_processor_kwargs) +from vllm.utils import flatten_2d_lists, full_groupby, is_list_of from .inputs import (AudioItem, ImageItem, MultiModalDataDict, MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, VideoItem) +logger = init_logger(__name__) -def bind_prompt_sequence( - seq: Union[str, list[int]], - tokenizer: AnyTokenizer, -) -> "_BoundPromptSequence": - """ - Bind a text or token sequence to a tokenizer so that it can be - lazily converted into the other format on demand. - """ - return _BoundPromptSequence( - tokenizer=tokenizer, - _text=seq if isinstance(seq, str) else None, - _token_ids=seq if isinstance(seq, list) else None, - ) - - -_T = TypeVar("_T") _S = TypeVar("_S", str, list[int]) +_PromptSeq = Union[str, list[int]] @dataclass -class PromptReplacement(Generic[_S, _T]): - target: _S - """The text or token sequence to find and replace.""" +class PromptReplacement: + modality: str + """The modality for which the replacement is made""" - repl_unit: _S - """ - The unit making up the replacement text or token sequence. - - See :code:`repl_count` for more details. - """ + target: _PromptSeq + """The text or token sequence to find and replace.""" - repl_count: Union[Callable[[list[_T], BatchFeature, int], int], int] + replacement: Union[Callable[[int], _PromptSeq], + _PromptSeq] = field(repr=False) """ - Given the original multi-modal items for this modality, HF-processed data, - and index of the processed item, output the number of repetitions of - :code:`repl_unit` to build up the replacement text or token sequence. + Given the index of the processed item within :attr:`modality`, output the + replacement text or token sequence. - For convenience, you can pass in an integer if the number of repetitions is - a constant. + For convenience, you can pass in the replacement instead of a function + if it does not depend on the input. 
""" - def __repr__(self) -> str: - return (f"{type(self).__name__}(target={self.target!r}, " - f"repl_unit={self.repl_unit!r})") - - def bind( - self, - modality: str, - tokenizer: AnyTokenizer, - ) -> "_BoundPromptReplacement[_T]": + def bind(self, tokenizer: AnyTokenizer) -> "_BoundPromptReplacement": return _BoundPromptReplacement( - modality=modality, - target=bind_prompt_sequence(self.target, tokenizer), - repl_unit=bind_prompt_sequence(self.repl_unit, tokenizer), - repl_count=self.repl_count, + tokenizer=tokenizer, + modality=self.modality, + _target=self.target, + _replacement=self.replacement, ) -@dataclass -class ModalityProcessingMetadata(Generic[_T]): - prompt_repls: Sequence[Union[PromptReplacement[str, _T], - PromptReplacement[list[int], _T]]] - """ - Defines each text or token sequence to replace in the HF-processed prompt. - - This is skipped if the HF-processed prompt is found to already contain - the replacement prompts. - """ - - -class MultiModalProcessingMetadataBuiltins(TypedDict, total=False): - """Type annotations for modality types predefined by vLLM.""" - - image: ModalityProcessingMetadata[ImageItem] - video: ModalityProcessingMetadata[VideoItem] - audio: ModalityProcessingMetadata[AudioItem] - - -MultiModalProcessingMetadata: TypeAlias = \ - Mapping[str, ModalityProcessingMetadata[Any]] -""" -A dictionary containing an entry for each modality type to process. - -Note: - This dictionary also accepts modality keys defined outside - :class:`MultiModalProcessingMetadataBuiltins` as long as a customized plugin - is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. -""" - - def _encode( tokenizer: AnyTokenizer, text: str, @@ -185,7 +128,8 @@ def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: @dataclass class _BoundPromptSequence: - tokenizer: AnyTokenizer + tokenizer: AnyTokenizer = field(repr=False) + _text: Optional[str] _token_ids: Optional[list[int]] @@ -210,38 +154,92 @@ def token_ids(self) -> list[int]: return self._token_ids - def __repr__(self) -> str: - return (f"{type(self).__name__}(_text={self._text!r}, " - f"_token_ids={self._token_ids!r})") - @dataclass -class _BoundPromptReplacement(Generic[_T]): +class _BoundPromptReplacement: + tokenizer: AnyTokenizer = field(repr=False) modality: str - target: _BoundPromptSequence - repl_unit: _BoundPromptSequence - repl_count: Union[Callable[[list[_T], BatchFeature, int], int], int] - def get_count( - self, - mm_items: list[_T], - hf_inputs: BatchFeature, - item_idx: int, - ) -> int: - repl_count = self.repl_count - if isinstance(repl_count, int): - return repl_count + _target: _PromptSeq + _replacement: Union[Callable[[int], _PromptSeq], + _PromptSeq] = field(repr=False) - return repl_count(mm_items, hf_inputs, item_idx) + def __post_init__(self) -> None: + self._replacement_cache = dict[int, _BoundPromptSequence]() + + @property + def target(self) -> _BoundPromptSequence: + target = self._target + return _BoundPromptSequence( + tokenizer=self.tokenizer, + _text=target if isinstance(target, str) else None, + _token_ids=target if isinstance(target, list) else None, + ) -def to_multi_format(data: MultiModalDataDict) -> dict[str, list[Any]]: + def get_replacement(self, item_idx: int) -> _BoundPromptSequence: + replacement = self._replacement + if callable(replacement): + cache_key = item_idx + if cache_key in self._replacement_cache: + return self._replacement_cache[cache_key] + + replacement = replacement(item_idx) + else: + cache_key = 
None + + bound_replacement = _BoundPromptSequence( + tokenizer=self.tokenizer, + _text=replacement if isinstance(replacement, str) else None, + _token_ids=replacement if isinstance(replacement, list) else None, + ) + + if cache_key is not None: + self._replacement_cache[cache_key] = bound_replacement + + return bound_replacement + + +class ImageSize(NamedTuple): + width: int + height: int + + +class MultiModalDataItems(UserDict[str, list[Any]]): """ - Convert a :class:`MultiModalDataDict` containing single data items - to a :class:`MultiModalMultiDataDict` containing multiple data items - per entry. + As :class:`MultiModalDataDict`, but normalized such that each entry + corresponds to a list. """ - multi_data = dict[str, list[Any]]() + + @property + def image(self) -> list[ImageItem]: + return self["image"] + + @property + def video(self) -> list[VideoItem]: + return self["video"] + + @property + def audio(self) -> list[AudioItem]: + return self["audio"] + + def get_image_size(self, item_idx: int) -> ImageSize: + image = self.image[item_idx] + + if isinstance(image, Image): + return ImageSize(*image.size) + if isinstance(image, (np.ndarray, torch.Tensor)): + _, h, w = image.shape + return ImageSize(w, h) + + assert_never(image) + + +def to_multi_format(data: MultiModalDataDict) -> MultiModalDataItems: + """ + Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. + """ + multi_data = MultiModalDataItems() for k, v in data.items(): # yapf: disable @@ -266,22 +264,33 @@ def iter_token_matches( token_ids: list[int], match_ids: list[int], ) -> Iterable[_TokenMatch]: - """Yield each occurrence of :code:`match_ids` in :code:`token_ids`.""" + """ + Yield each occurrence of :code:`match_ids` in :code:`token_ids`. + + Note that empty matches are ignored. 
+ """ + prompt_len = len(token_ids) match_len = len(match_ids) - last_end_idx = 0 - for start_idx in range(len(token_ids) - match_len + 1): - if start_idx < last_end_idx: - continue # Exclude overlapping matches + if match_len == 0: + return + start_idx = 0 + while start_idx < prompt_len - match_len + 1: end_idx = start_idx + match_len + if token_ids[start_idx:end_idx] == match_ids: yield _TokenMatch(start_idx=start_idx, end_idx=end_idx) - last_end_idx = end_idx + + # Exclude overlapping matches + start_idx = end_idx + else: + start_idx += 1 -class _PromptReplacementMatch(ABC, Generic[_T, _S]): - prompt_repl: _BoundPromptReplacement[_T] +@dataclass(repr=False) +class _PromptReplacementMatch(ABC): + prompt_repl: _BoundPromptReplacement @property def modality(self) -> str: @@ -297,19 +306,13 @@ def start_idx(self) -> int: def end_idx(self) -> int: raise NotImplementedError - @property - @abstractmethod - def repl_unit(self) -> _S: - raise NotImplementedError - def __repr__(self) -> str: return (f"{type(self).__name__}(modality={self.modality!r}, " f"start_idx={self.start_idx!r}, end_idx={self.end_idx!r})") @dataclass(repr=False) -class _PromptReplacementTokenMatch(_PromptReplacementMatch[_T, list[int]]): - prompt_repl: _BoundPromptReplacement[_T] +class _PromptReplacementTokenMatch(_PromptReplacementMatch): match: _TokenMatch @property @@ -320,14 +323,9 @@ def start_idx(self) -> int: def end_idx(self) -> int: return self.match.end_idx - @property - def repl_unit(self) -> list[int]: - return self.prompt_repl.repl_unit.token_ids - @dataclass(repr=False) -class _PromptReplacementTextMatch(_PromptReplacementMatch[_T, str]): - prompt_repl: _BoundPromptReplacement[_T] +class _PromptReplacementTextMatch(_PromptReplacementMatch): match: re.Match[str] @property @@ -338,20 +336,15 @@ def start_idx(self) -> int: def end_idx(self) -> int: return self.match.end() - @property - def repl_unit(self) -> str: - return self.prompt_repl.repl_unit.text - class _PlaceholderInfo(NamedTuple): modality: str start_idx: int - unit: list[int] - unit_count: int + replacement: list[int] @property def length(self) -> int: - return len(self.unit) * self.unit_count + return len(self.replacement) def to_range(self) -> PlaceholderRange: return PlaceholderRange( @@ -362,8 +355,8 @@ def to_range(self) -> PlaceholderRange: def find_token_matches( prompt: list[int], - prompt_repls: Sequence[_BoundPromptReplacement[_T]], -) -> list[_PromptReplacementTokenMatch[_T]]: + prompt_repls: Sequence[_BoundPromptReplacement], +) -> list[_PromptReplacementTokenMatch]: """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" return [ _PromptReplacementTokenMatch(prompt_repl, match) @@ -374,8 +367,8 @@ def find_token_matches( def find_text_matches( prompt: str, - prompt_repls: Sequence[_BoundPromptReplacement[_T]], -) -> list[_PromptReplacementTextMatch[_T]]: + prompt_repls: Sequence[_BoundPromptReplacement], +) -> list[_PromptReplacementTextMatch]: """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" return [ _PromptReplacementTextMatch(prompt_repl, match) @@ -385,15 +378,15 @@ def find_text_matches( def _resolve_matches( - prompt: _S, - matches: Sequence[_PromptReplacementMatch[_T, _S]], -) -> list[_PromptReplacementMatch[_T, _S]]: + prompt: _PromptSeq, + matches: Sequence[_PromptReplacementMatch], +) -> list[_PromptReplacementMatch]: """ Resolve :code:`matches` to ensure that there are no overlapping matches, and sort them such that earlier matches take priority over later ones. 
""" - seen_matches: list[Optional[_PromptReplacementMatch[_T, _S]]] \ - = [None] * len(prompt) + seen_matches: list[Optional[_PromptReplacementMatch]] = [None + ] * len(prompt) for match in matches: for idx in range(match.start_idx, match.end_idx): @@ -409,30 +402,34 @@ def _resolve_matches( def _replace_matches( prompt: _S, - matches: Sequence[_PromptReplacementMatch[_T, _S]], - mm_items_by_modality: Mapping[str, list[_T]], - hf_inputs: BatchFeature, + matches: Sequence[_PromptReplacementMatch], + mm_items: MultiModalDataItems, ) -> list[_S]: out_seqs = list[_S]() prev_end_idx = 0 - next_idx_by_modality = {modality: 0 for modality in mm_items_by_modality} + next_idx_by_modality = {modality: 0 for modality in mm_items} for match in _resolve_matches(prompt, matches): modality = match.modality - mm_items = mm_items_by_modality[modality] + modal_items = mm_items[modality] item_idx = next_idx_by_modality[modality] - if item_idx >= len(mm_items): + if item_idx >= len(modal_items): continue start_idx = match.start_idx end_idx = match.end_idx - repl_unit = match.repl_unit + repl_info = match.prompt_repl - repl_count = repl_info.get_count(mm_items, hf_inputs, item_idx) + replacement = repl_info.get_replacement(item_idx) + + if isinstance(prompt, str): + repl_seq = replacement.text + out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) + else: + repl_seq = replacement.token_ids + out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) - out_seqs.append(prompt[prev_end_idx:start_idx] + - repl_unit * repl_count) prev_end_idx = end_idx next_idx_by_modality[modality] += 1 @@ -443,92 +440,104 @@ def _replace_matches( def replace_token_matches( prompt: list[int], - matches: Sequence[_PromptReplacementMatch[_T, list[int]]], - mm_items_by_modality: Mapping[str, list[_T]], - hf_inputs: BatchFeature, + matches: Sequence[_PromptReplacementTokenMatch], + mm_items: MultiModalDataItems, ) -> list[int]: """Apply :code:`prompt_repls` to :code:`prompt`.""" if not matches: return prompt - token_id_seqs = _replace_matches( - prompt, - matches, - mm_items_by_modality, - hf_inputs, - ) + token_id_seqs = _replace_matches(prompt, matches, mm_items) return flatten_2d_lists(token_id_seqs) def replace_text_matches( prompt: str, - matches: Sequence[_PromptReplacementMatch[_T, str]], - mm_items_by_modality: Mapping[str, list[_T]], - hf_inputs: BatchFeature, + matches: Sequence[_PromptReplacementTextMatch], + mm_items: MultiModalDataItems, ) -> str: """Apply :code:`prompt_repls` to :code:`prompt`.""" if not matches: return prompt - texts = _replace_matches( - prompt, - matches, - mm_items_by_modality, - hf_inputs, - ) + texts = _replace_matches(prompt, matches, mm_items) return "".join(texts) -def _merge_placeholder_matches( - matches: Iterable[_PromptReplacementTokenMatch], -) -> Iterable[_PromptReplacementTokenMatch]: - current_match = None +def _iter_modality_placeholders( + prompt: list[int], + modality: str, + modality_repls: Sequence[_BoundPromptReplacement], + modal_items: list[Any], +) -> Iterable[_PlaceholderInfo]: + if len(modal_items) == 0: + return - for match in sorted(matches, key=lambda x: x.start_idx): - if current_match is None: - current_match = match - elif (current_match.prompt_repl == match.prompt_repl - and current_match.end_idx == match.start_idx): - current_match = _PromptReplacementTokenMatch( - current_match.prompt_repl, - match=_TokenMatch(current_match.start_idx, match.end_idx), - ) - else: - yield current_match - current_match = match + prompt_len = len(prompt) + item_index = 0 + + 
start_idx = 0 + while start_idx < prompt_len: + found = False + + for repl_info in modality_repls: + replacement = repl_info.get_replacement(item_index) + repl_tokens = replacement.token_ids + repl_len = len(repl_tokens) + end_idx = start_idx + repl_len + + if repl_len == 0 or end_idx > prompt_len: + continue - if current_match is not None: - yield current_match + if prompt[start_idx:end_idx] == repl_tokens: + yield _PlaceholderInfo( + modality=modality, + start_idx=start_idx, + replacement=repl_tokens, + ) + + item_index += 1 + if item_index >= len(modal_items): + return + + # Exclude overlapping matches + start_idx = end_idx + found = True + break + + if not found: + start_idx += 1 def iter_placeholders( - prompt_repls: Sequence[_BoundPromptReplacement[Any]], + prompt_repls: Sequence[_BoundPromptReplacement], prompt: list[int], - *, - min_unit_count: int = 1, + mm_items: MultiModalDataItems, ) -> Iterable[_PlaceholderInfo]: - """Yield each set of placeholder tokens found in :code:`token_ids`.""" - if min_unit_count <= 0: - raise ValueError("`min_unit_count` must be a positive integer") - - matches = (_PromptReplacementTokenMatch(prompt_repl, match) - for prompt_repl in prompt_repls - if len(repl_unit := prompt_repl.repl_unit.token_ids) > 0 - for match in iter_token_matches(prompt, repl_unit)) - - for match in _merge_placeholder_matches(matches): - unit = match.repl_unit - placeholder = _PlaceholderInfo( - modality=match.modality, - start_idx=match.start_idx, - unit=unit, - unit_count=(match.end_idx - match.start_idx) // len(unit), - ) + """ + Yield each set of placeholder tokens found in :code:`prompt`. + + Note that empty matches are ignored. + """ + repls_by_modality = dict(full_groupby_modality(prompt_repls)) + + for modality, modal_items in mm_items.items(): + if modality in repls_by_modality: + yield from _iter_modality_placeholders( + prompt, + modality, + repls_by_modality[modality], + modal_items, + ) + - if placeholder.unit_count >= min_unit_count: - yield placeholder +class ProcessorInputs(NamedTuple): + """Keyword arguments to :meth:`BaseMultiModalProcessor`""" + prompt_text: str + mm_data: MultiModalDataDict + mm_processor_kwargs: Mapping[str, object] class BaseMultiModalProcessor(ABC): @@ -536,52 +545,55 @@ class BaseMultiModalProcessor(ABC): Abstract base class to process multi-modal inputs to be used in vLLM. """ - def __init__( - self, - ctx: InputProcessingContext, - metadata: MultiModalProcessingMetadata, - ) -> None: + def __init__(self, ctx: InputProcessingContext) -> None: super().__init__() self.ctx = ctx - self.metadata = metadata - self.init_mm_processor_kwargs = (ctx.model_config.mm_processor_kwargs - or {}) - def _get_hf_processor( + def __call__( self, - **mm_processor_kwargs: Mapping[str, object], - ) -> ProcessorMixin: - # by default, we won't pass any kwargs to the processor initialization + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + return self.apply(prompt, mm_data, mm_processor_kwargs) + + def _get_hf_processor(self) -> ProcessorMixin: + """ + Subclasses can add keyword arguments to this method to accept + additional kwargs from model config or user inputs. 
+ """ return self.ctx.get_hf_processor() def _get_tokenizer(self) -> AnyTokenizer: return self.ctx.tokenizer - def __call__( + @abstractmethod + def _get_prompt_replacements( self, - prompt: str, - mm_data: MultiModalDataDict, + mm_items: MultiModalDataItems, + hf_inputs: BatchFeature, mm_processor_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - return self.apply(prompt, mm_data, mm_processor_kwargs) + ) -> list[PromptReplacement]: + """ + Given the original multi-modal items for this modality + and HF-processed data, output the replacements to perform. + + Note: + Even when the HF processor already performs replacement for us, + we still use this replacement information to determine + the placeholder token positions for each multi-modal item. + """ + raise NotImplementedError def _find_placeholders( self, - all_prompt_repls: Sequence[_BoundPromptReplacement[Any]], + all_prompt_repls: Sequence[_BoundPromptReplacement], new_token_ids: list[int], - *, - # To avoid false positives from multi-input when detecting - # whether placeholder tokens have been inserted, in case - # the target sequence is a subset of the replacement tokens - min_unit_count: int = 16, + mm_items: MultiModalDataItems, ) -> list[_PlaceholderInfo]: return list( - iter_placeholders( - all_prompt_repls, - new_token_ids, - min_unit_count=min_unit_count, - )) + iter_placeholders(all_prompt_repls, new_token_ids, mm_items)) def _apply_hf_processor( self, @@ -589,13 +601,7 @@ def _apply_hf_processor( mm_data: MultiModalDataDict, mm_processor_kwargs: Mapping[str, object], ) -> BatchFeature: - # some mm_processor_kwargs may be used in processor initialization - # instead of processor call - processor_init_kwargs = { - **self.init_mm_processor_kwargs, - **mm_processor_kwargs, - } - hf_processor = self._get_hf_processor(**processor_init_kwargs) + hf_processor = self._get_hf_processor(**mm_processor_kwargs) processor_data = dict[str, Any]() passthrough_data = dict[str, Any]() @@ -615,11 +621,10 @@ def _apply_hf_processor( else: processor_data[k] = v - # filter mm_processor_kwargs used in processor call - mm_processor_kwargs = resolve_mm_processor_kwargs( - self.init_mm_processor_kwargs, - cast(Dict[str, Any], mm_processor_kwargs), + assert callable(hf_processor) + mm_processor_kwargs = self.ctx.resolve_hf_processor_call_kwargs( hf_processor, + mm_processor_kwargs, ) try: @@ -642,26 +647,21 @@ def _apply_hf_processor( def _bind_prompt_replacements( self, - mm_data: MultiModalDataDict, - ) -> list[_BoundPromptReplacement[Any]]: + prompt_repls: list[PromptReplacement], + ) -> list[_BoundPromptReplacement]: tokenizer = self._get_tokenizer() - return [ - prompt_repl.bind(modality, tokenizer) - for modality, metadata in self.metadata.items() - if modality in mm_data for prompt_repl in metadata.prompt_repls - ] + return [prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls] def _apply_prompt_replacements( self, - mm_data: MultiModalDataDict, + mm_items: MultiModalDataItems, hf_inputs: BatchFeature, token_ids: list[int], - prompt_repls: Sequence[_BoundPromptReplacement[Any]], + prompt_repls: Sequence[_BoundPromptReplacement], ) -> tuple[list[int], str, list[_PlaceholderInfo]]: tokenizer = self._get_tokenizer() - mm_items = to_multi_format(mm_data) token_matches = find_token_matches(token_ids, prompt_repls) # If the search text does not represent a special token, @@ -682,7 +682,6 @@ def _apply_prompt_replacements( token_ids, token_matches, mm_items, - hf_inputs, ) text = _decode(tokenizer, token_ids) @@ -695,13 +694,13 @@ 
def _apply_prompt_replacements( text, text_matches, mm_items, - hf_inputs, ) token_ids = _encode(tokenizer, text) matched_repls = [match.prompt_repl for match in text_matches] - placeholders = self._find_placeholders(matched_repls, token_ids) + placeholders = self._find_placeholders(matched_repls, token_ids, + mm_items) return token_ids, text, placeholders @@ -731,12 +730,16 @@ def apply( prompt_ids, = hf_inputs.pop("input_ids").tolist() mm_kwargs = MultiModalKwargs(hf_inputs) - all_prompt_repls = self._bind_prompt_replacements(mm_data) + mm_items = to_multi_format(mm_data) + prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs, + mm_processor_kwargs) + all_prompt_repls = self._bind_prompt_replacements(prompt_repls) # If HF processor already inserts placeholder tokens, # there is no need for us to insert them all_placeholders = self._find_placeholders(all_prompt_repls, - prompt_ids) + prompt_ids, mm_items) + if all_placeholders: prompt_text = _decode(tokenizer, prompt_ids) else: @@ -745,7 +748,7 @@ def apply( prompt_text, all_placeholders, ) = self._apply_prompt_replacements( - mm_data, + mm_items, hf_inputs, prompt_ids, all_prompt_repls, @@ -765,13 +768,13 @@ def apply( ) @abstractmethod - def _get_dummy_mm_kwargs( + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], - ) -> MultiModalKwargs: + ) -> ProcessorInputs: """ - Build the input that corresponds to `mm_max_tokens` in - :meth:`get_dummy_data`. + Build the multi-modal portion of the input which, after processing, + results in `mm_max_tokens` in :meth:`get_dummy_data`. """ raise NotImplementedError @@ -784,38 +787,41 @@ def get_dummy_data( # Avoid circular import from vllm.sequence import SequenceData - tokenizer = self._get_tokenizer() - - mm_placeholders = dict[str, _PlaceholderInfo]() - offset = 0 - - for modality, max_tokens in mm_max_tokens.items(): - if max_tokens == 0: - continue - - metadata = self.metadata[modality] - repl = metadata.prompt_repls[0].bind(modality, tokenizer) - repl_token_ids = repl.repl_unit.token_ids - - placeholders = _PlaceholderInfo( - modality=modality, - start_idx=offset, - unit=repl_token_ids, - unit_count=max_tokens // len(repl_token_ids), - ) - - mm_placeholders[modality] = placeholders - offset += placeholders.length + processor_inputs = self._get_dummy_mm_inputs(mm_counts) + mm_inputs = self.apply(*processor_inputs) + + prompt_token_ids = mm_inputs["prompt_token_ids"] + placeholders_by_modality = mm_inputs["mm_placeholders"] + + total_placeholders_by_modality = dict[str, int]() + for modality, placeholders in placeholders_by_modality.items(): + num_placeholders = sum(item["length"] for item in placeholders) + max_tokens = mm_max_tokens[modality] + + if num_placeholders != max_tokens: + logger.warning( + "The processed dummy data has a total of %d placeholder " + "tokens for the '%s' modality, which is not the expected " + "%d tokens.", num_placeholders, modality, max_tokens) + + total_placeholders_by_modality[modality] = num_placeholders + + total_len = len(prompt_token_ids) + if total_len > seq_len: + logger.warning( + "The context length (%d) of the model is too short " + "to hold the multi-modal embeddings in the worst case " + "(%d tokens in total, out of which %s are reserved for " + "multi-modal embeddings). This may cause certain multi-modal " + "inputs to fail during inference, even when the input text is " + "short. 
To avoid this, you should increase `max_model_len`, " + "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, + total_len, total_placeholders_by_modality) - prompt_token_ids = flatten_2d_lists( - [p.unit * p.unit_count for p in mm_placeholders.values()]) prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) return DummyData( seq_data=SequenceData.from_seqs(prompt_token_ids), - multi_modal_data=self._get_dummy_mm_kwargs(mm_counts), - multi_modal_placeholders={ - modality: [p.to_range()] - for modality, p in mm_placeholders.items() - }, + multi_modal_data=mm_inputs["mm_kwargs"], + multi_modal_placeholders=placeholders_by_modality, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 6ab6c0fe2f12e..03f8814a95356 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -299,9 +299,9 @@ def register_processor( """ def wrapper(model_cls: N) -> N: - if model_cls in self._processor_factories: + if self._processor_factories.contains(model_cls, strict=True): logger.warning( - "Model class %s already has an input mapper " + "Model class %s already has a multi-modal processor " "registered to %s. It is overwritten by the new one.", model_cls, self) diff --git a/vllm/utils.py b/vllm/utils.py index fbc3ef7fa7f89..45e682ac15782 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1370,8 +1370,8 @@ def supports_kw( def resolve_mm_processor_kwargs( - init_kwargs: Optional[Dict[str, Any]], - inference_kwargs: Optional[Dict[str, Any]], + init_kwargs: Optional[Mapping[str, object]], + inference_kwargs: Optional[Mapping[str, object]], callable: Callable[..., object], allow_var_kwargs: bool = False, ) -> Dict[str, Any]: @@ -1405,7 +1405,7 @@ def resolve_mm_processor_kwargs( def get_allowed_kwarg_only_overrides( callable: Callable[..., object], - overrides: Optional[Dict[str, Any]], + overrides: Optional[Mapping[str, object]], allow_var_kwargs: bool = False, ) -> Dict[str, Any]: """ @@ -1524,9 +1524,15 @@ def __getitem__(self, key: Type[T]) -> _V: raise KeyError(key) def __contains__(self, key: object) -> bool: + return self.contains(key) + + def contains(self, key: object, *, strict: bool = False) -> bool: if not isinstance(key, type): return False + if strict: + return key in self.data + return any(cls in self.data for cls in key.mro()) From 6d917d0eebd03990edf2443780a5f2506026ea78 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Sat, 14 Dec 2024 17:54:04 +0000 Subject: [PATCH 28/56] Enable mypy checking on V1 code (#11105) Signed-off-by: Mark McLoughlin --- tools/mypy.sh | 1 + vllm/v1/attention/backends/flash_attn.py | 2 ++ vllm/v1/core/kv_cache_manager.py | 10 +++--- vllm/v1/core/kv_cache_utils.py | 17 +++++----- vllm/v1/core/scheduler.py | 1 + vllm/v1/engine/__init__.py | 23 ++++++++----- vllm/v1/engine/async_llm.py | 11 +++--- vllm/v1/engine/core.py | 20 +++++------ vllm/v1/engine/core_client.py | 43 +++++++++++++----------- vllm/v1/engine/detokenizer.py | 4 +-- vllm/v1/engine/llm_engine.py | 3 +- vllm/v1/engine/mm_input_mapper.py | 20 +++++++---- vllm/v1/engine/processor.py | 2 +- vllm/v1/executor/abstract.py | 12 ++----- vllm/v1/executor/multiproc_executor.py | 15 +++++---- vllm/v1/executor/uniproc_executor.py | 7 ++-- vllm/v1/request.py | 3 +- vllm/v1/utils.py | 42 ++++++++++++++--------- vllm/v1/worker/gpu_input_batch.py | 1 + vllm/v1/worker/gpu_model_runner.py | 42 ++++++++++++++--------- vllm/v1/worker/gpu_worker.py | 2 +- 21 files changed, 160 insertions(+), 121 deletions(-) diff --git a/tools/mypy.sh b/tools/mypy.sh index 
e984e739d70cf..2454ff9fde466 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -29,3 +29,4 @@ run_mypy vllm/plugins run_mypy vllm/prompt_adapter run_mypy vllm/spec_decode run_mypy vllm/worker +run_mypy vllm/v1 diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index c9f04ace644c7..026a0292cc339 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -135,6 +135,8 @@ def forward( assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") + assert output is not None, "Output tensor must be provided." + if attn_metadata is None: # Profiling run. return output diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 8044481a9cd6a..aaa44c930e324 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Dict, List, Optional +from typing import Dict, Iterable, List, Optional from vllm.logger import init_logger from vllm.utils import cdiv @@ -263,12 +263,13 @@ def free(self, request: Request) -> None: """ # Default to [] in case a request is freed (aborted) before alloc. blocks = self.req_to_blocks.pop(request.request_id, []) + ordered_blocks: Iterable[KVCacheBlock] = blocks if self.enable_caching: # Free blocks in reverse order so that the tail blocks are # freed first. - blocks = reversed(blocks) + ordered_blocks = reversed(blocks) - for block in blocks: + for block in ordered_blocks: block.decr_ref() if block.ref_cnt == 0: self.free_block_queue.append(block) @@ -396,8 +397,7 @@ def _cache_full_blocks( f"{request.request_id}({request})") # Compute the hash of the current block. - block_hash = hash_block_tokens(prev_block_hash_value, - tuple(block_tokens)) + block_hash = hash_block_tokens(prev_block_hash_value, block_tokens) # Update and added the full block to the cache. blk.block_hash = block_hash diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 814e462a91fed..0ba338aa5a3d2 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,4 +1,5 @@ """KV-Cache Utilities.""" +from collections.abc import Sequence from dataclasses import dataclass from typing import List, NamedTuple, Optional, Tuple @@ -13,7 +14,7 @@ class BlockHashType(NamedTuple): collision happens when the hash value is the same. """ hash_value: int - token_ids: Tuple[int] + token_ids: Tuple[int, ...] @dataclass @@ -79,8 +80,8 @@ def __init__(self, blocks: List[KVCacheBlock]) -> None: self.num_free_blocks = len(blocks) # Initialize the doubly linked list of free blocks. - self.free_list_head = blocks[0] - self.free_list_tail = blocks[-1] + self.free_list_head: Optional[KVCacheBlock] = blocks[0] + self.free_list_tail: Optional[KVCacheBlock] = blocks[-1] for i in range(self.num_free_blocks): if i > 0: blocks[i].prev_free_block = blocks[i - 1] @@ -159,7 +160,7 @@ def get_all_free_blocks(self) -> List[KVCacheBlock]: def hash_block_tokens(parent_block_hash: Optional[int], - curr_block_token_ids: Tuple[int]) -> BlockHashType: + curr_block_token_ids: Sequence[int]) -> BlockHashType: """Computes a hash value corresponding to the contents of a block and the contents of the preceding block(s). The hash value is used for prefix caching. We use LRU cache for this function to avoid recomputing @@ -171,7 +172,7 @@ def hash_block_tokens(parent_block_hash: Optional[int], Args: parent_block_hash: The hash of the parent block. 
None if this is the first block. - curr_block_token_ids: A tuple of token ids in the current + curr_block_token_ids: A list of token ids in the current block. The current block is assumed to be full. Returns: @@ -179,11 +180,11 @@ def hash_block_tokens(parent_block_hash: Optional[int], The entire tuple is used as the hash key of the block. """ return BlockHashType(hash((parent_block_hash, *curr_block_token_ids)), - curr_block_token_ids) + tuple(curr_block_token_ids)) def hash_request_tokens(block_size: int, - token_ids: List[int]) -> List[BlockHashType]: + token_ids: Sequence[int]) -> List[BlockHashType]: """Computes hash values of a chain of blocks given a sequence of token IDs. The hash value is used for prefix caching. @@ -198,7 +199,7 @@ def hash_request_tokens(block_size: int, parent_block_hash_value = None for start in range(0, len(token_ids), block_size): end = start + block_size - block_token_ids = tuple(token_ids[start:end]) + block_token_ids = token_ids[start:end] # Do not hash the block if it is not full. if len(block_token_ids) < block_size: break diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index f055eed77c372..f76364f64033d 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -152,6 +152,7 @@ def schedule(self) -> "SchedulerOutput": break if not can_schedule: break + assert new_blocks is not None # Schedule the request. scheduled_running_reqs.append(request) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index abeea052c1fa5..cc0c7ea23469a 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -36,7 +36,7 @@ class EngineCoreRequest: prompt: Optional[str] prompt_token_ids: List[int] mm_inputs: Optional[List[Optional[MultiModalKwargs]]] - mm_hashes: Optional[List[Optional[str]]] + mm_hashes: Optional[List[str]] mm_placeholders: Optional[MultiModalPlaceholderDict] sampling_params: SamplingParams eos_token_id: Optional[int] @@ -44,10 +44,11 @@ class EngineCoreRequest: lora_request: Optional[LoRARequest] -class EngineCoreOutput(msgspec.Struct, - array_like=True, - omit_defaults=True, - gc=False): +class EngineCoreOutput( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False): # type: ignore[call-arg] request_id: str new_token_ids: List[int] @@ -56,10 +57,11 @@ class EngineCoreOutput(msgspec.Struct, stop_reason: Union[int, str, None] = None -class EngineCoreOutputs(msgspec.Struct, - array_like=True, - omit_defaults=True, - gc=False): +class EngineCoreOutputs( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False): # type: ignore[call-arg] #NOTE(Nick): We could consider ways to make this more compact, # e.g. 
columnwise layout and using an int enum for finish/stop reason @@ -81,3 +83,6 @@ class EngineCoreRequestType(enum.Enum): ADD = b'\x00' ABORT = b'\x01' PROFILE = b'\x02' + + +EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 24cafeff63d1e..b36de5f66917c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -81,7 +81,7 @@ def __init__( asyncio_mode=True, ) - self.output_handler = None + self.output_handler: Optional[asyncio.Task] = None def __del__(self): self.shutdown() @@ -126,7 +126,8 @@ def shutdown(self): handler.cancel() @classmethod - def _get_executor_cls(cls, vllm_config: VllmConfig): + def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: + executor_class: Type[Executor] distributed_executor_backend = ( vllm_config.parallel_config.distributed_executor_backend) if distributed_executor_backend == "mp": @@ -361,10 +362,10 @@ async def check_health(self) -> None: logger.debug("Called check_health.") async def start_profile(self) -> None: - await self.engine_core.profile(True) + await self.engine_core.profile_async(True) async def stop_profile(self) -> None: - await self.engine_core.profile(False) + await self.engine_core.profile_async(False) @property def is_running(self) -> bool: @@ -380,7 +381,7 @@ def errored(self) -> bool: @property def dead_error(self) -> BaseException: - return Exception + return Exception() # TODO: implement # Retain V0 name for backwards compatibility. diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index af644fb5fedba..56d4dc67e4a0e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -5,7 +5,7 @@ import time from dataclasses import dataclass from multiprocessing.process import BaseProcess -from typing import List, Tuple, Type, Union +from typing import List, Tuple, Type import zmq import zmq.asyncio @@ -20,7 +20,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType) + EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -97,8 +97,10 @@ def add_request(self, request: EngineCoreRequest): # Note that the cache here is mirrored with the client side of the # MM mapper, so anything that has a hash must have a HIT cache # entry here as well. - request.mm_inputs = self.mm_input_mapper_server.process_inputs( - request.mm_inputs, request.mm_hashes) + assert request.mm_inputs is not None + request.mm_inputs, request.mm_hashes = ( + self.mm_input_mapper_server.process_inputs( + request.mm_inputs, request.mm_hashes)) req = Request.from_engine_core_request(request) @@ -128,7 +130,7 @@ def step(self) -> List[EngineCoreOutput]: def shutdown(self): self.model_executor.shutdown() - def profile(self, is_start=True): + def profile(self, is_start: bool = True): self.model_executor.profile(is_start) @@ -161,8 +163,8 @@ def __init__( # and to overlap some serialization/deserialization with the # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
- self.input_queue = queue.Queue() - self.output_queue = queue.Queue() + self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue() + self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, ), daemon=True).start() @@ -318,9 +320,7 @@ def _log_stats(self): self._last_logging_time = now - def _handle_client_request( - self, request: Union[EngineCoreRequest, EngineCoreProfile, - List[str]]) -> None: + def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: """Handle EngineCoreRequest or EngineCoreABORT from Client.""" if isinstance(request, EngineCoreRequest): diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index e0bfe1b93b360..ff25a9b2e9cac 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,6 +1,6 @@ import atexit import os -from typing import List, Union +from typing import List, Optional import msgspec import zmq @@ -10,8 +10,9 @@ from vllm.utils import get_open_zmq_ipc_path, kill_process_tree from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType) -from vllm.v1.engine.core import EngineCore, EngineCoreProc + EngineCoreRequestType, EngineCoreRequestUnion) +from vllm.v1.engine.core import (EngineCore, EngineCoreProc, + EngineCoreProcHandle) from vllm.v1.serial_utils import PickleEncoder logger = init_logger(__name__) @@ -59,7 +60,7 @@ def get_output(self) -> List[EngineCoreOutput]: def add_request(self, request: EngineCoreRequest) -> None: raise NotImplementedError - async def profile(self, is_start=True) -> None: + def profile(self, is_start: bool = True) -> None: raise NotImplementedError def abort_requests(self, request_ids: List[str]) -> None: @@ -71,6 +72,9 @@ async def get_output_async(self) -> List[EngineCoreOutput]: async def add_request_async(self, request: EngineCoreRequest) -> None: raise NotImplementedError + async def profile_async(self, is_start: bool = True) -> None: + raise NotImplementedError + async def abort_requests_async(self, request_ids: List[str]) -> None: raise NotImplementedError @@ -105,7 +109,7 @@ def shutdown(self): def __del__(self): self.shutdown() - def profile(self, is_start=True) -> None: + def profile(self, is_start: bool = True) -> None: self.engine_core.profile(is_start) @@ -133,7 +137,10 @@ def __init__( self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) # ZMQ setup. - self.ctx = (zmq.asyncio.Context() if asyncio_mode else zmq.Context()) + if asyncio_mode: + self.ctx = zmq.asyncio.Context() + else: + self.ctx = zmq.Context() # type: ignore[attr-defined] # Path for IPC. ready_path = get_open_zmq_ipc_path() @@ -149,11 +156,13 @@ def __init__( self.input_socket.bind(input_path) # Start EngineCore in background process. 
+ self.proc_handle: Optional[EngineCoreProcHandle] self.proc_handle = EngineCoreProc.make_engine_core_process( *args, - input_path=input_path, - output_path=output_path, - ready_path=ready_path, + input_path= + input_path, # type: ignore[misc] # MyPy incorrectly flags duplicate keywords + output_path=output_path, # type: ignore[misc] + ready_path=ready_path, # type: ignore[misc] **kwargs, ) atexit.register(self.shutdown) @@ -204,10 +213,8 @@ def get_output(self) -> List[EngineCoreOutput]: engine_core_outputs = self.decoder.decode(frame.buffer).outputs return engine_core_outputs - def _send_input( - self, request_type: EngineCoreRequestType, - request: Union[EngineCoreRequest, EngineCoreProfile, - List[str]]) -> None: + def _send_input(self, request_type: EngineCoreRequestType, + request: EngineCoreRequestUnion) -> None: # (RequestType, SerializedRequest) msg = (request_type.value, self.encoder.encode(request)) @@ -219,7 +226,7 @@ def add_request(self, request: EngineCoreRequest) -> None: def abort_requests(self, request_ids: List[str]) -> None: self._send_input(EngineCoreRequestType.ABORT, request_ids) - def profile(self, is_start=True) -> None: + def profile(self, is_start: bool = True) -> None: self._send_input(EngineCoreRequestType.PROFILE, EngineCoreProfile(is_start)) @@ -237,10 +244,8 @@ async def get_output_async(self) -> List[EngineCoreOutput]: return engine_core_outputs - async def _send_input( - self, request_type: EngineCoreRequestType, - request: Union[EngineCoreRequest, EngineCoreProfile, - List[str]]) -> None: + async def _send_input(self, request_type: EngineCoreRequestType, + request: EngineCoreRequestUnion) -> None: msg = (request_type.value, self.encoder.encode(request)) await self.input_socket.send_multipart(msg, copy=False) @@ -252,6 +257,6 @@ async def abort_requests_async(self, request_ids: List[str]) -> None: if len(request_ids) > 0: await self._send_input(EngineCoreRequestType.ABORT, request_ids) - async def profile(self, is_start=True) -> None: + async def profile_async(self, is_start: bool = True) -> None: await self._send_input(EngineCoreRequestType.PROFILE, EngineCoreProfile(is_start)) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 6249d60199a62..02f34e2b54dd5 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple, Union from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger @@ -97,7 +97,7 @@ def add_tokens( self, new_token_ids: List[int], finish_reason: Optional[str], - stop_reason: Optional[str], + stop_reason: Optional[Union[int, str, None]], ) -> Optional[RequestOutput]: """ Update RequestState for the request_id by: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c02494897b41f..15dedbd0f9529 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -103,7 +103,8 @@ def from_engine_args( multiprocess_mode=enable_multiprocessing) @classmethod - def _get_executor_cls(cls, vllm_config: VllmConfig): + def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: + executor_class: Type[Executor] distributed_executor_backend = ( vllm_config.parallel_config.distributed_executor_backend) if distributed_executor_backend == "mp": diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py index 
58ee29bedb201..cca27c2218af7 100644 --- a/vllm/v1/engine/mm_input_mapper.py +++ b/vllm/v1/engine/mm_input_mapper.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple import PIL from blake3 import blake3 @@ -42,14 +42,14 @@ def __init__( model_config) self.mm_registry.init_mm_limits_per_prompt(model_config) - self.mm_cache = LRUDictCache(MM_CACHE_SIZE) + self.mm_cache = LRUDictCache[str, MultiModalKwargs](MM_CACHE_SIZE) # DEBUG: Set to None to disable self.mm_debug_cache_hit_ratio_steps = None self.mm_cache_hits = 0 self.mm_cache_total = 0 - def cache_hit_ratio(self, steps) -> float: + def cache_hit_ratio(self, steps): if self.mm_cache_total > 0 and self.mm_cache_total % steps == 0: logger.debug("MMInputMapper: cache_hit_ratio = %.2f ", self.mm_cache_hits / self.mm_cache_total) @@ -60,7 +60,7 @@ def process_inputs( mm_hashes: Optional[List[str]], mm_processor_kwargs: Optional[Dict[str, Any]], precomputed_mm_inputs: Optional[List[MultiModalKwargs]], - ) -> List[MultiModalKwargs]: + ) -> Tuple[List[MultiModalKwargs], Optional[List[str]]]: if precomputed_mm_inputs is None: image_inputs = mm_data["image"] if not isinstance(image_inputs, list): @@ -72,6 +72,7 @@ def process_inputs( # Check if hash is enabled use_hash = mm_hashes is not None if use_hash: + assert mm_hashes is not None assert num_inputs == len( mm_hashes), "num_inputs = {} len(mm_hashes) = {}".format( num_inputs, len(mm_hashes)) @@ -79,7 +80,7 @@ def process_inputs( # Process each image input separately, so that later we can schedule # them in a fine-grained manner. # Apply caching (if enabled) and reuse precomputed inputs (if provided) - ret_hashes = [] if use_hash else None + ret_hashes: Optional[List[str]] = [] if use_hash else None ret_inputs: List[MultiModalKwargs] = [] for input_id in range(num_inputs): if self.mm_debug_cache_hit_ratio_steps is not None: @@ -88,6 +89,7 @@ def process_inputs( mm_hash = None mm_input = None if use_hash: + assert mm_hashes is not None mm_hash = mm_hashes[input_id] mm_input = self.mm_cache.get(mm_hash) @@ -105,12 +107,15 @@ def process_inputs( if use_hash: # Add to cache + assert mm_hash is not None self.mm_cache.put(mm_hash, mm_input) else: self.mm_cache_hits += 1 mm_input = None # Avoids sending mm_input to Server if use_hash: + assert mm_hash is not None + assert ret_hashes is not None ret_hashes.append(mm_hash) ret_inputs.append(mm_input) @@ -120,17 +125,18 @@ def process_inputs( class MMInputMapperServer: def __init__(self, ): - self.mm_cache = LRUDictCache(MM_CACHE_SIZE) + self.mm_cache = LRUDictCache[str, MultiModalKwargs](MM_CACHE_SIZE) def process_inputs( self, mm_inputs: List[Optional[MultiModalKwargs]], - mm_hashes: List[Optional[str]], + mm_hashes: List[str], ) -> List[MultiModalKwargs]: assert len(mm_inputs) == len(mm_hashes) full_mm_inputs = [] for mm_input, mm_hash in zip(mm_inputs, mm_hashes): + assert mm_hash is not None if mm_input is None: mm_input = self.mm_cache.get(mm_hash) assert mm_input is not None diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 903996bad3726..679bf8e25e9ca 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -56,7 +56,7 @@ def process_inputs( request_id: str, prompt: PromptType, params: Union[SamplingParams, PoolingParams], - arrival_time: float, + arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = 
None, diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 9cd267581ad18..564d0447f15a6 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, Optional, Tuple +from typing import Tuple from vllm.config import VllmConfig from vllm.v1.outputs import ModelRunnerOutput @@ -28,7 +28,7 @@ def execute_model( raise NotImplementedError @abstractmethod - def profile(self, is_start=True): + def profile(self, is_start: bool = True): raise NotImplementedError @abstractmethod @@ -38,11 +38,3 @@ def shutdown(self): @abstractmethod def check_health(self) -> None: raise NotImplementedError - - @abstractmethod - def collective_rpc(self, - method: str, - timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> []: - raise NotImplementedError diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 14384a730ceec..17441dacdc5cf 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from enum import Enum, auto from multiprocessing.process import BaseProcess -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import zmq @@ -21,6 +21,7 @@ from vllm.logger import init_logger from vllm.utils import (get_distributed_init_method, get_open_port, get_open_zmq_ipc_path) +from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.utils import make_zmq_socket from vllm.worker.worker_base import WorkerWrapperBase @@ -31,7 +32,7 @@ POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -class MultiprocExecutor: +class MultiprocExecutor(Executor): def __init__(self, vllm_config: VllmConfig) -> None: # Call self.shutdown at exit to clean up @@ -103,7 +104,7 @@ def collective_rpc(self, method: str, timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> []: + kwargs: Optional[Dict] = None) -> List[Any]: """ Execute an RPC call on workers. 
@@ -125,7 +126,7 @@ def collective_rpc(self, responses = [None] * self.world_size for w in self.workers: - dequeue_timeout = timeout - (time.monotonic() - start_time() + dequeue_timeout = timeout - (time.monotonic() - start_time ) if timeout is not None else None status, result = w.worker_response_mq.dequeue( timeout=dequeue_timeout) @@ -153,7 +154,7 @@ def execute_model( args=(scheduler_output, ))[0] return model_output - def profile(self, is_start=True): + def profile(self, is_start: bool = True): self.collective_rpc("profile", args=(is_start, )) return @@ -185,7 +186,6 @@ def wait_for_termination(procs, timeout): p.kill() self._cleanup_sockets() - self.workers = None def _cleanup_sockets(self): for w in self.workers: @@ -200,7 +200,8 @@ def shutdown(self): # again atexit.unregister(self.shutdown) """Properly shut down the executor and its workers""" - if (hasattr(self, 'workers') and self.workers is not None): + if getattr(self, 'shutting_down', False): + self.shutting_down = True for w in self.workers: #TODO: not sure if needed w.worker_response_mq = None self._ensure_worker_termination() diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py index 9b1d9a40950c6..be058318de58b 100644 --- a/vllm/v1/executor/uniproc_executor.py +++ b/vllm/v1/executor/uniproc_executor.py @@ -4,13 +4,14 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu_worker import Worker logger = init_logger(__name__) -class UniprocExecutor: +class UniprocExecutor(Executor): def __init__(self, vllm_config: VllmConfig) -> None: self.vllm_config = vllm_config @@ -25,7 +26,7 @@ def __init__(self, vllm_config: VllmConfig) -> None: self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config - self.worker = self._create_worker() + self.worker: Worker = self._create_worker() self.worker.initialize() self.worker.load_model() @@ -75,7 +76,7 @@ def profile(self, is_start: bool = True): self.worker.profile(is_start) def shutdown(self): - self.worker = None + pass def check_health(self) -> None: # UniprocExecutor will always be healthy as long as diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 6bc1e4d5c769f..1737d096e811d 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -52,10 +52,9 @@ def __init__( else: self.mm_positions = [] # Output of the mm input mapper (e.g., image tensors). 
+ self.mm_inputs: List[MultiModalKwargs] = [] if self.inputs.multi_modal_inputs: self.mm_inputs = self.inputs.multi_modal_inputs - else: - self.mm_inputs: List[MultiModalKwargs] = [] @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 6ecf20e717ca3..5f327d7066830 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,6 +1,8 @@ from collections import OrderedDict +from collections.abc import Sequence from contextlib import contextmanager -from typing import Any, Generic, Iterator, List, TypeVar, overload +from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union, + overload) import zmq @@ -11,7 +13,7 @@ T = TypeVar("T") -class ConstantList(Generic[T]): +class ConstantList(Generic[T], Sequence): def __init__(self, x: List[T]) -> None: self._x = x @@ -34,29 +36,33 @@ def remove(self, item): def clear(self): raise Exception("Cannot clear a constant list") - def index(self, item): - return self._x.index(item) + def index(self, + item: T, + start: int = 0, + stop: Optional[int] = None) -> int: + return self._x.index(item, start, + stop if stop is not None else len(self._x)) @overload - def __getitem__(self, item) -> T: + def __getitem__(self, item: int) -> T: ... @overload def __getitem__(self, s: slice, /) -> List[T]: ... - def __getitem__(self, item): + def __getitem__(self, item: Union[int, slice]) -> Union[T, List[T]]: return self._x[item] @overload - def __setitem__(self, item, value): + def __setitem__(self, item: int, value: T): ... @overload - def __setitem__(self, s: slice, value, /): + def __setitem__(self, s: slice, value: T, /): ... - def __setitem__(self, item, value): + def __setitem__(self, item: Union[int, slice], value: Union[T, List[T]]): raise Exception("Cannot set item in a constant list") def __delitem__(self, item): @@ -73,10 +79,12 @@ def __len__(self): @contextmanager -def make_zmq_socket(path: str, type: Any) -> Iterator[zmq.Socket]: +def make_zmq_socket( + path: str, + type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] """Context manager for a ZMQ socket""" - ctx = zmq.Context() + ctx = zmq.Context() # type: ignore[attr-defined] try: socket = ctx.socket(type) @@ -96,20 +104,24 @@ def make_zmq_socket(path: str, type: Any) -> Iterator[zmq.Socket]: ctx.destroy(linger=0) -class LRUDictCache: +K = TypeVar('K') +V = TypeVar('V') + + +class LRUDictCache(Generic[K, V]): def __init__(self, size: int): - self.cache = OrderedDict() + self.cache: OrderedDict[K, V] = OrderedDict() self.size = size - def get(self, key, default=None): + def get(self, key: K, default=None) -> V: if key not in self.cache: return default self.cache.move_to_end(key) return self.cache[key] - def put(self, key, value): + def put(self, key: K, value: V): self.cache[key] = value self.cache.move_to_end(key) if len(self.cache) > self.size: diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 9046b37f60005..5c113c74778df 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -215,6 +215,7 @@ def condense(self, empty_req_indices: List[int]) -> None: # Swap the states. 
req_id = self.req_ids[last_req_index] + assert req_id is not None self.req_ids[empty_index] = req_id self.req_ids[last_req_index] = None self.req_id_to_index[req_id] = empty_index diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f24942068d1f8..abcd4b007a326 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,6 @@ import gc import time -from typing import TYPE_CHECKING, Dict, List, Tuple +from typing import TYPE_CHECKING, Dict, List, Tuple, cast import numpy as np import torch @@ -193,9 +193,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_ids_to_add: List[str] = [] # Add new requests to the cached states. - for req_data in scheduler_output.scheduled_new_reqs: - req_id = req_data.req_id - sampling_params = req_data.sampling_params + for new_req_data in scheduler_output.scheduled_new_reqs: + req_id = new_req_data.req_id + sampling_params = new_req_data.sampling_params if sampling_params.sampling_type == SamplingType.RANDOM_SEED: generator = torch.Generator(device=self.device) generator.manual_seed(sampling_params.seed) @@ -204,25 +204,25 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: self.requests[req_id] = CachedRequestState( req_id=req_id, - prompt_token_ids=req_data.prompt_token_ids, - prompt=req_data.prompt, - mm_inputs=req_data.mm_inputs, - mm_positions=req_data.mm_positions, + prompt_token_ids=new_req_data.prompt_token_ids, + prompt=new_req_data.prompt, + mm_inputs=new_req_data.mm_inputs, + mm_positions=new_req_data.mm_positions, sampling_params=sampling_params, generator=generator, - block_ids=req_data.block_ids, - num_computed_tokens=req_data.num_computed_tokens, + block_ids=new_req_data.block_ids, + num_computed_tokens=new_req_data.num_computed_tokens, output_token_ids=[], ) req_ids_to_add.append(req_id) # Update the cached states of the resumed requests. - for req_data in scheduler_output.scheduled_resumed_reqs: - req_id = req_data.req_id + for res_req_data in scheduler_output.scheduled_resumed_reqs: + req_id = res_req_data.req_id req_state = self.requests[req_id] - req_state.block_ids = req_data.block_ids - req_state.num_computed_tokens = req_data.num_computed_tokens + req_state.block_ids = res_req_data.block_ids + req_state.num_computed_tokens = res_req_data.num_computed_tokens req_ids_to_add.append(req_id) # Add the new or resumed requests to the persistent batch. @@ -259,6 +259,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): num_scheduled_tokens = [] max_num_scheduled_tokens = 0 for req_id in self.input_batch.req_ids[:num_reqs]: + assert req_id is not None num_tokens = scheduler_output.num_scheduled_tokens[req_id] num_scheduled_tokens.append(num_tokens) max_num_scheduled_tokens = max(max_num_scheduled_tokens, @@ -373,7 +374,7 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # Batch the multi-modal inputs. 
mm_inputs: List[MultiModalKwargs] = [] - req_input_ids: List[Tuple[int, int]] = [] + req_input_ids: List[Tuple[str, int]] = [] for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): req_state = self.requests[req_id] for input_id in encoder_input_ids: @@ -406,6 +407,7 @@ def _gather_encoder_outputs( encoder_outputs: List[torch.Tensor] = [] num_reqs = self.input_batch.num_reqs for req_id in self.input_batch.req_ids[:num_reqs]: + assert req_id is not None num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ req_id] req_state = self.requests[req_id] @@ -514,6 +516,7 @@ def execute_model( # the requests one by one. Optimize. num_reqs = self.input_batch.num_reqs for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + assert req_id is not None req_state = self.requests[req_id] seq_len = (req_state.num_computed_tokens + scheduler_output.num_scheduled_tokens[req_id]) @@ -539,8 +542,15 @@ def execute_model( logprobs = None else: logprobs = sampler_output.logprobs.cpu() + + # num_reqs entries should be non-None + assert all( + req_id is not None for req_id in + self.input_batch.req_ids[:num_reqs]), "req_ids contains None" + req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) + model_runner_output = ModelRunnerOutput( - req_ids=self.input_batch.req_ids[:num_reqs], + req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=sampled_token_ids, logprob_token_ids_cpu=logprob_token_ids, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 49e415ab72e0b..33491f700de10 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -204,7 +204,7 @@ def execute_model( return output if self.rank == 0 else None return output - def profile(self, is_start=True): + def profile(self, is_start: bool = True): if self.profiler is None: raise RuntimeError("Profiler is not enabled.") if is_start: From 886936837ca89e5645bc1f71cc0e1492b65b1590 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Sun, 15 Dec 2024 04:38:10 +0900 Subject: [PATCH 29/56] [Performance][Core] Optimize the performance of evictor v1 and v2 by applying a priority queue and lazy deletion (#7209) --- vllm/core/evictor.py | 63 ++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index ed7e06cab2996..44adc4158abec 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -1,6 +1,7 @@ import enum +import heapq from abc import ABC, abstractmethod -from typing import OrderedDict, Tuple +from typing import Dict, List, Tuple class EvictionPolicy(enum.Enum): @@ -75,8 +76,14 @@ class LRUEvictor(Evictor): highest num_hashed_tokens value, then one will be chose arbitrarily """ + # CLEANUP_THRESHOLD determines the maximum allowable size of the priority + # queue relative to the free table size. When this threshold is exceeded, + # a cleanup operation is triggered to reduce memory usage. 
+ CLEANUP_THRESHOLD = 50 + def __init__(self): - self.free_table: OrderedDict[int, BlockMetaData] = OrderedDict() + self.free_table: Dict[int, BlockMetaData] = {} + self.priority_queue = [] def __contains__(self, block_id: int) -> bool: return block_id in self.free_table @@ -85,34 +92,50 @@ def evict(self) -> Tuple[int, int]: if len(self.free_table) == 0: raise ValueError("No usable cache memory left") - evicted_block, evicted_block_id = None, None - # The blocks with the lowest timestamps should be placed consecutively - # at the start of OrderedDict. Loop through all these blocks to - # find the one with maximum number of hashed tokens. - for _id, block in self.free_table.items(): - if evicted_block is None: - evicted_block, evicted_block_id = block, _id - continue - if evicted_block.last_accessed < block.last_accessed: - break - if evicted_block.num_hashed_tokens < block.num_hashed_tokens: - evicted_block, evicted_block_id = block, _id - - assert evicted_block is not None - assert evicted_block_id is not None - self.free_table.pop(evicted_block_id) - - return evicted_block_id, evicted_block.content_hash + while self.priority_queue: + # We do not remove outdated entries from the priority queue at the + # time of updating the last_accessed timestamp. Instead, outdated + # entries are filtered out here during eviction. Outdated entries + # would either not in the free table, or have older last accessed + # time. + last_accessed, _, block_id, content_hash = heapq.heappop( + self.priority_queue) + if (block_id in self.free_table and + self.free_table[block_id].last_accessed == last_accessed): + self.free_table.pop(block_id) + return block_id, content_hash + + raise ValueError("No usable cache memory left") def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, last_accessed: float): self.free_table[block_id] = BlockMetaData(content_hash, num_hashed_tokens, last_accessed) + heapq.heappush( + self.priority_queue, + (last_accessed, -num_hashed_tokens, block_id, content_hash)) + self._cleanup_if_necessary() def update(self, block_id: int, last_accessed: float): self.free_table[block_id].last_accessed = last_accessed + def _cleanup_if_necessary(self): + if len(self.priority_queue) > LRUEvictor.CLEANUP_THRESHOLD * len( + self.free_table): + self._cleanup() + + def _cleanup(self): + new_priority_queue: List[Tuple[float, int, int, int]] = [] + + for block_id, block in self.free_table.items(): + new_priority_queue.append( + (block.last_accessed, -block.num_hashed_tokens, block_id, + block.content_hash)) + heapq.heapify(new_priority_queue) + + self.priority_queue = new_priority_queue + def remove(self, block_id: int): if block_id not in self.free_table: raise ValueError( From 15859f2357059ef488405e5336d2c6e5d246687b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 15 Dec 2024 11:03:06 +0800 Subject: [PATCH 30/56] [[Misc]Upgrade bitsandbytes to the latest version 0.45.0 (#11201) --- Dockerfile | 2 +- docs/source/quantization/bnb.rst | 2 +- requirements-test.in | 2 +- requirements-test.txt | 2 +- vllm/model_executor/layers/quantization/bitsandbytes.py | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 682f046d4b6ec..c1b6e1bbfe354 100644 --- a/Dockerfile +++ b/Dockerfile @@ -218,7 +218,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10 + pip install 
accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' timm==0.9.10 ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/docs/source/quantization/bnb.rst b/docs/source/quantization/bnb.rst index 682938cc63d48..84f805bb60c2a 100644 --- a/docs/source/quantization/bnb.rst +++ b/docs/source/quantization/bnb.rst @@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM. .. code-block:: console - $ pip install bitsandbytes>=0.44.0 + $ pip install bitsandbytes>=0.45.0 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. diff --git a/requirements-test.in b/requirements-test.in index 57fddb416317e..fb4179c3d8423 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -25,7 +25,7 @@ datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.4 # required for model evaluation test # quantization -bitsandbytes>=0.44.0 +bitsandbytes>=0.45.0 buildkite-test-collector==0.1.9 numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index c786a1249bddb..3771577fe8ed0 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -37,7 +37,7 @@ audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements-test.in -bitsandbytes==0.44.1 +bitsandbytes>=0.45.0 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index e01c713dd14db..5dc872933282c 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -145,12 +145,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase): def __init__(self, quant_config: BitsAndBytesConfig): try: import bitsandbytes - if bitsandbytes.__version__ < "0.44.0": + if bitsandbytes.__version__ < "0.45.0": raise ImportError("bitsandbytes version is wrong. 
Please " - "install bitsandbytes>=0.44.0.") + "install bitsandbytes>=0.45.0.") except ImportError as err: - raise ImportError("Please install bitsandbytes>=0.44.0 via " - "`pip install bitsandbytes>=0.44.0` to use " + raise ImportError("Please install bitsandbytes>=0.45.0 via " + "`pip install bitsandbytes>=0.45.0` to use " "bitsandbytes quantizer.") from err self.quant_config = quant_config From a1c02058baf47be1a91ee743378a340ee1b10416 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 14 Dec 2024 19:45:00 -0800 Subject: [PATCH 31/56] [torch.compile] allow tracking forward time (#11081) Signed-off-by: youkaichao --- vllm/forward_context.py | 61 ++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index cd136f43c0c57..7f56575279e9b 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -1,9 +1,11 @@ import time -from collections import Counter +from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass from typing import Any, Dict, Optional +import torch + import vllm.envs as envs from vllm.config import VllmConfig from vllm.logger import init_logger @@ -11,9 +13,10 @@ logger = init_logger(__name__) track_batchsize: bool = envs.VLLM_LOG_BATCHSIZE_INTERVAL >= 0 -batchsize_counter: Counter = Counter() last_logging_time: float = 0 +forward_start_time: float = 0 batchsize_logging_interval: float = envs.VLLM_LOG_BATCHSIZE_INTERVAL +batchsize_forward_time: defaultdict = defaultdict(list) @dataclass @@ -40,23 +43,10 @@ def set_forward_context(context: Any, vllm_config: VllmConfig): can be attention metadata, etc. Here we can inject common logic for every model forward pass. """ - global track_batchsize, batchsize_counter - global last_logging_time, batchsize_logging_interval - if track_batchsize and context is not None: - if hasattr(context, "num_prefill_tokens"): - # for v0 attention backends - batchsize = context.num_prefill_tokens + context.num_decode_tokens - else: - # for v1 attention backends - batchsize = context.num_input_tokens - batchsize_counter[batchsize] += 1 - if time.monotonic() - last_logging_time > batchsize_logging_interval: - last_logging_time = time.monotonic() - sorted_data = sorted(batchsize_counter.items(), - key=lambda x: x[1], - reverse=True) - logger.info("Batchsize distribution (batchsize, count): %s", - sorted_data) + global forward_start_time + need_to_track_batchsize = track_batchsize and context is not None + if need_to_track_batchsize: + forward_start_time = time.perf_counter() global _forward_context prev_context = _forward_context _forward_context = ForwardContext( @@ -66,4 +56,37 @@ def set_forward_context(context: Any, vllm_config: VllmConfig): try: yield finally: + global batchsize_counter + global last_logging_time, batchsize_logging_interval + if need_to_track_batchsize: + if hasattr(context, "num_prefill_tokens"): + # for v0 attention backends + batchsize = context.num_prefill_tokens + \ + context.num_decode_tokens + else: + # for v1 attention backends + batchsize = context.num_input_tokens + # we use synchronous scheduling right now, + # adding a sync point here should not affect + # scheduling of the next batch + torch.cuda.synchronize() + now = time.perf_counter() + # time measurement is in milliseconds + batchsize_forward_time[batchsize].append( + (now - forward_start_time) * 1000) + if now - last_logging_time > batchsize_logging_interval: + last_logging_time = now + forward_stats = [] + 
for bs, times in batchsize_forward_time.items(): + if len(times) <= 1: + # can be cudagraph / profiling run + continue + medium = torch.quantile(torch.tensor(times), q=0.5).item() + medium = round(medium, 2) + forward_stats.append((bs, len(times), medium)) + forward_stats.sort(key=lambda x: x[1], reverse=True) + if forward_stats: + logger.info(("Batchsize forward time stats " + "(batchsize, count, median_time(ms)): %s"), + forward_stats) _forward_context = prev_context From b10609e6a11554be61976981304984510a0469c9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 15 Dec 2024 14:30:28 +0800 Subject: [PATCH 32/56] [Misc] Clean up multi-modal processor (#11207) Signed-off-by: DarkLight1337 --- examples/offline_inference_vision_language.py | 5 +- tests/multimodal/test_processing.py | 17 ++++--- vllm/multimodal/processing.py | 48 +++++++++---------- 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 45539c665a922..7bc43242b717e 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -92,10 +92,7 @@ def run_fuyu(question: str, modality: str): def run_phi3v(question: str, modality: str): assert modality == "image" - prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501 - # Note: The default setting of max_num_seqs (256) and - # max_model_len (128k) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. + prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # num_crops is an override kwarg to the multimodal image processor; # For some models, e.g., Phi-3.5-vision-instruct, it is recommended diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 6aaa80ddc9fa5..d22d778f81fa8 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -2,10 +2,9 @@ import pytest -from vllm.multimodal.processing import (MultiModalDataItems, PromptReplacement, - _PlaceholderInfo, find_text_matches, - find_token_matches, iter_placeholders, - iter_token_matches, +from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo, + find_text_matches, find_token_matches, + iter_placeholders, iter_token_matches, replace_text_matches, replace_token_matches) from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -314,8 +313,8 @@ def test_find_replace_text( result = replace_text_matches( prompt, matches, - MultiModalDataItems({key: [None] * mm_count - for key in repl_by_key}), + {key: mm_count + for key in repl_by_key}, ) # Only displayed on error @@ -380,8 +379,8 @@ def test_find_replace_tokens( result = replace_token_matches( prompt, matches, - MultiModalDataItems({key: [None] * mm_count - for key in repl_by_key}), + {key: mm_count + for key in repl_by_key}, ) # Only displayed on error @@ -476,7 +475,7 @@ def test_iter_placeholders( prompt_repls, prompt, # Effectively match all occurrences in the prompt - MultiModalDataItems({key: [None] * 3 for key in repl_by_key}), + {key: 3 for key in repl_by_key}, )) # Only displayed on error diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index de5a002d474c2..ce6bec1d49aac 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -403,18 +403,17 @@ def _resolve_matches( def _replace_matches( prompt: _S, matches: Sequence[_PromptReplacementMatch], - mm_items: MultiModalDataItems, + 
mm_item_counts: Mapping[str, int], ) -> list[_S]: out_seqs = list[_S]() prev_end_idx = 0 - next_idx_by_modality = {modality: 0 for modality in mm_items} + next_idx_by_modality = {modality: 0 for modality in mm_item_counts} for match in _resolve_matches(prompt, matches): modality = match.modality - modal_items = mm_items[modality] item_idx = next_idx_by_modality[modality] - if item_idx >= len(modal_items): + if item_idx >= mm_item_counts[modality]: continue start_idx = match.start_idx @@ -441,13 +440,13 @@ def _replace_matches( def replace_token_matches( prompt: list[int], matches: Sequence[_PromptReplacementTokenMatch], - mm_items: MultiModalDataItems, + mm_item_counts: Mapping[str, int], ) -> list[int]: """Apply :code:`prompt_repls` to :code:`prompt`.""" if not matches: return prompt - token_id_seqs = _replace_matches(prompt, matches, mm_items) + token_id_seqs = _replace_matches(prompt, matches, mm_item_counts) return flatten_2d_lists(token_id_seqs) @@ -455,13 +454,13 @@ def replace_token_matches( def replace_text_matches( prompt: str, matches: Sequence[_PromptReplacementTextMatch], - mm_items: MultiModalDataItems, + mm_item_counts: Mapping[str, int], ) -> str: """Apply :code:`prompt_repls` to :code:`prompt`.""" if not matches: return prompt - texts = _replace_matches(prompt, matches, mm_items) + texts = _replace_matches(prompt, matches, mm_item_counts) return "".join(texts) @@ -470,9 +469,9 @@ def _iter_modality_placeholders( prompt: list[int], modality: str, modality_repls: Sequence[_BoundPromptReplacement], - modal_items: list[Any], + modal_item_count: int, ) -> Iterable[_PlaceholderInfo]: - if len(modal_items) == 0: + if modal_item_count == 0: return prompt_len = len(prompt) @@ -499,7 +498,7 @@ def _iter_modality_placeholders( ) item_index += 1 - if item_index >= len(modal_items): + if item_index >= modal_item_count: return # Exclude overlapping matches @@ -514,7 +513,7 @@ def _iter_modality_placeholders( def iter_placeholders( prompt_repls: Sequence[_BoundPromptReplacement], prompt: list[int], - mm_items: MultiModalDataItems, + mm_item_counts: Mapping[str, int], ) -> Iterable[_PlaceholderInfo]: """ Yield each set of placeholder tokens found in :code:`prompt`. @@ -523,13 +522,13 @@ def iter_placeholders( """ repls_by_modality = dict(full_groupby_modality(prompt_repls)) - for modality, modal_items in mm_items.items(): + for modality, modal_item_count in mm_item_counts.items(): if modality in repls_by_modality: yield from _iter_modality_placeholders( prompt, modality, repls_by_modality[modality], - modal_items, + modal_item_count, ) @@ -590,10 +589,10 @@ def _find_placeholders( self, all_prompt_repls: Sequence[_BoundPromptReplacement], new_token_ids: list[int], - mm_items: MultiModalDataItems, + mm_item_counts: Mapping[str, int], ) -> list[_PlaceholderInfo]: return list( - iter_placeholders(all_prompt_repls, new_token_ids, mm_items)) + iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts)) def _apply_hf_processor( self, @@ -655,10 +654,9 @@ def _bind_prompt_replacements( def _apply_prompt_replacements( self, - mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, token_ids: list[int], prompt_repls: Sequence[_BoundPromptReplacement], + mm_item_counts: Mapping[str, int], ) -> tuple[list[int], str, list[_PlaceholderInfo]]: tokenizer = self._get_tokenizer() @@ -675,13 +673,13 @@ def _apply_prompt_replacements( # of the search text in the prompt, we instead perform string # replacement on the decoded token IDs, then encode them back. 
if all( - len(matches) >= len(mm_items[modality]) + len(matches) >= mm_item_counts[modality] for modality, matches in full_groupby_modality(token_matches) ): # yapf: disable token_ids = replace_token_matches( token_ids, token_matches, - mm_items, + mm_item_counts, ) text = _decode(tokenizer, token_ids) @@ -693,14 +691,14 @@ def _apply_prompt_replacements( text = replace_text_matches( text, text_matches, - mm_items, + mm_item_counts, ) token_ids = _encode(tokenizer, text) matched_repls = [match.prompt_repl for match in text_matches] placeholders = self._find_placeholders(matched_repls, token_ids, - mm_items) + mm_item_counts) return token_ids, text, placeholders @@ -737,8 +735,9 @@ def apply( # If HF processor already inserts placeholder tokens, # there is no need for us to insert them + mm_item_counts = {m: len(items) for m, items in mm_items.items()} all_placeholders = self._find_placeholders(all_prompt_repls, - prompt_ids, mm_items) + prompt_ids, mm_item_counts) if all_placeholders: prompt_text = _decode(tokenizer, prompt_ids) @@ -748,10 +747,9 @@ def apply( prompt_text, all_placeholders, ) = self._apply_prompt_replacements( - mm_items, - hf_inputs, prompt_ids, all_prompt_repls, + mm_item_counts, ) mm_placeholders = { From 96d673e0f897aa8eec234e690c9c5425782d6ffb Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 16 Dec 2024 01:59:42 +0800 Subject: [PATCH 33/56] [Bugfix] Fix error handling of unsupported sliding window (#11213) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/llama.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 733b1bc7d80ac..2902e6999c2fd 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -169,13 +169,15 @@ def __init__( ) if hasattr(config, "interleaved_sliding_window"): - if isinstance(config.interleaved_sliding_window, int): - sliding_window = config.interleaved_sliding_window - elif isinstance(config.interleaved_sliding_window, list): - sw_idx = layer_idx % len(config.interleaved_sliding_window) - sliding_window = config.interleaved_sliding_window[sw_idx] + interleaved_sliding_window = config.interleaved_sliding_window + if isinstance(interleaved_sliding_window, int): + sliding_window = interleaved_sliding_window + elif isinstance(interleaved_sliding_window, list): + sw_idx = layer_idx % len(interleaved_sliding_window) + sliding_window = interleaved_sliding_window[sw_idx] else: - raise ValueError(f"{type(sliding_window)} is not supported.") + raise ValueError( + f"{type(interleaved_sliding_window)} is not supported.") else: sliding_window = None From 38e599d6a84bb7477030a5488035cd23f529b644 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 15 Dec 2024 13:31:16 -0600 Subject: [PATCH 34/56] [Doc] add documentation for disaggregated prefilling (#11197) Signed-off-by: Kuntai Du --- .../usage/disagg_prefill/abstraction.jpg | Bin 0 -> 104673 bytes .../assets/usage/disagg_prefill/overview.jpg | Bin 0 -> 177439 bytes docs/source/index.rst | 1 + docs/source/usage/disagg_prefill.rst | 69 ++++++++++++++++++ 4 files changed, 70 insertions(+) create mode 100644 docs/source/assets/usage/disagg_prefill/abstraction.jpg create mode 100644 docs/source/assets/usage/disagg_prefill/overview.jpg create mode 100644 docs/source/usage/disagg_prefill.rst diff --git a/docs/source/assets/usage/disagg_prefill/abstraction.jpg b/docs/source/assets/usage/disagg_prefill/abstraction.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..1a99e3ed8cf5f3b6679196752896fca94a22a4a4
Binary files /dev/null and b/docs/source/assets/usage/disagg_prefill/abstraction.jpg differ
z!hbI$e^_Lh>irQ@w_I(&W@*|&o^thN{rI(}l3Y6+IU6uwn032Y1mYv~lj}(yVm#Er{y+BK zJRZvb-yc?z-DJ-?6{SKXOW7usElE^%O@(YBWXqVVRMsSfBBrcm8~YNO>>|m&#h6h_ znaoh*VwQeypL4$FKKJAMx$pa&`*)x7c-)W2`C}quT-Wu!-q&k+zMik=YZVoENAU`~ zy-wh^9T8O>18_7H-4>3csga>JPyE#pwu+Ax{s+;!VKvWZJjKf)*M>X2pa{xPbltgl zq1l?jkKv@5QevKCXCRL^!!3I}rGvbbL)`cBJ8iWDf^|aB-m4%QJyIwYzb)34pi{F zh-a_e;$*&A!MpHn1EQ#WTwL``{IV&> zZ;qrQtPIAGTHl6_xy22hEGJg#De8x8pZrCthvCJt#})$SISgwK<^iF*2pIDd>~4qgisHuuFW?j?&`%G{r6Grx4Z8qiZ<^Cj!kq z&5?wiNa;>45b4bfI}z{Ie=l>E?DMKT?@^Ipm0QN{_}gv{qA7{!rr+z_3?rx&0c<8} zihI8F4@XVVq|YTj+NC_jRPPK!krAiUE+!7K8s@sW6^Py2dT8Ok3Wq&Zz6SG_*dty7 z49I^Pah#O{xquYucw)$v?w$Uzo7AlMYQQ^-EV7-MjY%lyQtql zbnmD-BYS-!gmRr-gV{`zu%I$kjeeUQL=OO0yAyC4Gjp?6W(8u25 zcS#xBZ75HM73V%%?zU%6fTM|E4v!itL#;B97wRsG&&_@rV@^iz{PtQuPN}zQ)-&Xs zQ}XUBZ&ez6XV>vHgt=*}2PlL-;B#&x4YE1C)^<$6eWVXXB@^&n^GC7Eb14fOK2HkL zIpf5o_6jQ}*EFBK{YA>|cOKo5^$5$&?wXaOq3&U~F0Odp@<`A(jUXp9T}#CT%o0`1 z_q3_&H(b1OJo%v*AKXMw1827n+XmGTyFhW&!2ir>vl4l_QNoF_%#W^1_kX6nvRZt= zW@$og&$i^)ZFMGIGwQWTbs$m!-44Jj@YBwlQ$y;nl(=|G8JvQX`fwr=F!y*_* z&d~IO3ai6bYNj4OA1=u==p0@!y(|~yMmdxH+-*FRp@@A)A;|Ueq%%X038;IQf0Ir_vTe8|m-@-~( z!R}_yLN(nW;C7u)>O|hVbaYtXJ-pm&19i9ABvV%U0}i`#T2222e_Z~`84(fj!vd9a zmv`VsmfKOAI6#}#RA@J@b|8x4N20}agsEO0yJQiaGBY+PEZP@Kjo~?RcirwGdkROj zKuxUDS4OTg?Px`(8meh$v!z>o$Oa;xC%lv2=HvGFJ}*sXzSr(asyRGelZolE)#oQc zOkoBI(e6Lx`W8YmJ!ti0e20}-m@poHQwAvwrwCr+86)qlZ*+^JNSu3;qNd)lxXQB@ zBF@;2&O!pdx=04yij{1*7x{p_-7@GB?bg)Z2fE#{wGOp1FIC%mbd{#J7oVWyFdT?l_<;k$+(U`99uDBun!)ChF`og&Vz&6B_ySD z3Ly$Doe)WZx3k(xte(icv>YMne(;`WeW zVsn8Z%>hHCH9*m2^KW_>&dRL$45p~hoLv+~i;7+F+Vip2Kh92v^W4~;xVms!yH7B8 zh7_m<^y!@KQ>JKrLsLNlJNE;>v4hR=hl5Bt4~kAD^i;ag;4cD#-A>&8%Ti=k&q5E) z31jzPi93BQhFy|i*sMd;oHjgcVV#jd&GlEEs5R)V3^ zH?5bh$}X{eP$U5Lw2q^H?PhNI!$FuT`op1Q{WqscDA3(cKP(giY2p568GtwW=kr7l zWs61ZjL}174*c^qj{MDu;up`x8$39fjqCUL=S%#%6SezQWGvsc`@_IO^*>*u^+*eV zA6my+Br7sHB$0eA+u>+EEseUYA{TUfXkf|M(Myp=d+ZeM9|{35lk{I#hrI$maR{0# zoD~L^`5{J!CgP2iX@()19%=RkD`6hu_CBVg3sQj(k^&d`+jl=aem~h>LpNkfLSe0O+5~i zsKsO-f>2oIpxVv#pQ;5lDQ4)r@bkH5Te^EslQIMvZFo-_*#1)+AAGkHMYSeQbhn~w z-7}Vq$^KH2S>)&!dz-a4)ab#h%MmZ{4I*mxHrC1H?ONT>`J$EFLd<_zWMhszB3oPvmG$=<05YAT@$s3LE4x zF)()tpt6TqTw<6`sI?4e;j`!t`E?mlFs1**vRouDMOyQOY4TH@Eufg`Ri5vJWZ?e{V~Ecl6wi0D5}XE3c^! 
z+5V-TtKGp~!P1u7K30T%PJDZ?r?1sU-AGn=C}B&;zpW(|7mcb(YC|Uw#PlJmPiyFq zIoT>Yv@S&V_hYKLzoH=ISFgRYKiLkW@xuR*(MqMsbXuz+aBNQZuTEnzJBLUAst>lQ5%&M|+QR&S9U)TflBVX#+>|?HHLwP} z{8YRpoIQ`)axiHpoY4^~=ue`EkX93}O)3hi+>R=Xd%SFW%5>sO7!h_vQG>s-M4{Wz z+8+*J+`c4&6#1@eS=n-nFnc&hquFR*$tz4KP1nUm{e4yGjmg*1esh{t!auvm0DZY% zx#Uw>(n2a*B8e#%{)gk#JkTKi!!h!QW8Kx1sfvVkSJ`phZ&8dt9Kgn{bP)<(A6QwB z|GH`ptc@vK2t+5j0aOIAao-_kC9qll{AbWZGh%`3_Xqw{2igB+l^@ry^fzaa^p7h} z`aiB1vbbs8w;+kiwECMjk<=P!?#IA}(x#B|G8>r6K52IPx4ew^hagSd-@J(;d2Sk4 z+^@5Sz_=1wpg@$a#ZnzKReRzL6k7l0T}`NM5G*<#_3ar3*l)rH4m8vI0D|p9w0a|R z=g;Z2oT)+kX64y|HY|m7sMdD*>7-Bi6iRm6xLCvNu(h?vFvd z?1OOhkkhh&_iL?zDSbUM56PxquBj$<=NG=xi-?lE%g285s=MOIe~MA=O%%@m7e=}AdMP{& zoN^SaVUwB{)P=07X9^dQ4CCejE!DQ()QV=8$aD&%!dU*_dX7^wS2Je{-Q3+(swx#l z6jt7bupj?e_XaCx^E}!L-s?pZh!><+&Rd48(4qN;XHQb!2hG&wKG@h|ZR1@0t?P{` z&x{C{fQKN1C7N^_x5oJE8GuYK*ve0tBF7e>V+87JumRM- z2H1jHrvG*SQ6zxNwvM5}E#3+dDgSbZrL0Zs9!2_jtY(3|UBniu`fG2CZu;-P_O_%I zQx23nz$2o>V*mAopa;{$BD}c&=OF*HT>pRAT}8}I5VRXROQ;H^h|dj7{^4Nr8&H!@ zP;HZFd)74x2za$ML&W06US^E9QAn!gjX^&}LCQgKq)xi4z#>a>d=(E&mgQodB#vsD zwDY^h{j-pb0F}ZTT z@3-E{3X>~607(C5Q|%qYl%DbR4`v*|1k!wad9@7e=FxGPKNq9jpOi|R^7G5z`u_Mg ze$h|kzeXJjf&+6h0r}`Z?wihkSrz{u_xFEtj{o=WFaPhHBe-qDp(RG11GKB}p~kw{NETw#L$)%t^nch$Yl>9COh*(AGP zKv&l8*XXGTPEZIhH8K`Ex8-kM+^gqFh;_%+4kG?2FTZF6QG!%{BI5K0q7lU8H_I6G{+c-!AWQe>UFS@Bl?6|uguy~ySkwD zvsFq@1$qbT-U+(|Hfuy3+uG!~pYtAo!RtvGtpr}9IpjTg*&*hE7mn4;)(gb z$`XMUZ(qhc#3}K=TWET{DHEdnwnY&;jN@t%#kf$_-a<0o{fGptqGhPiox)ast@riF zD+wR6r{9?h8%swnDo*8%Z;9ntn^GpB0MWvNa1|X*tW}L!vYWS(W+jlX6$H3@`Q|~_ zi0@i_jV_jrQ-r6rDrU=@pB{H+PT-sNS8$1wP&F|WHL3;O7`BL{4bY><2qbxG`BTLm zpJrK^5pR-$_H^!CDa%Vg-#8((;|_6a(qt1o1K!JUX6~(kzcTcHpsA0->}*oPeDT4BxTh zz}yaY(-Ls%({x;QWCqKsAEzVtnuZRszRagg!cCuK^XV~mgYAH#)JQ?z2vO}gwK&Is zBBxd_gU>@{73jnC<4LuWw*=}m&Ta>Zh(|Q@WFFkqUMH_YT_So8|F-Zfr?^j@#~j&z z)3=mU;1&0_tp@umWG?(eZ`tl1^DrwPE5dYvUw}C2?j`<~6e|(TkvG+qv%6iPp!B|(L99YN(+!*-S~L3y$ep7UF}s^9?(9xyL&5`rJ?idh!6o1B z>bw%}O89k7@8WRMkxGpaQ-F>BN3jfBc9-J+pe6qv5c*G}CCI-)bxJnQ=O(%C`2P&J zx)e%YhLu5QRXdw^0PmTj1-8N)j5xJ~sW9gMVz|5+O`Yudga$6&Ikd-$>-&-7h8~gQIc~ncO)t9t+GTUo_40X- z@7rU_zOnZWsxZw-Kc3)kvQF^)^ex|gnPwqSEG4(|#>K}`Jah4*_z%mlqRW?2Kw=l^ zqX##>NiKNAJVyPf(yhdQg$MbF!|q#Rko3P9)PGn6@b$k;$>YC&XbQd^f{z_xpUa{R zu-|Cp<^gKk79UZ+qNO)Nng&0B9t-uq_+8{*%W+7d-$~Nnk%Bi9BhJ32$tO!&Mi!h5 z7M-2y-eqf&&gHM6b-DLK)$Nl4!QxYPs~!wBZ^w|tO)M8X`nmAT=EAMDdVwe{pn z|o9CW@8WrEVIhoM*JnfwYyo}ldf3V!<8$4c-*(^WL5K(miO@!Zby|x1+IKC zv;&DB*Uh0^z*6!bm_)Jp_(=_}!!?4+wn&85z%G>oLRlE(J7bso`ZZBbW4tT#SCdmif8lA zE;tG%M{^t|`lN?zl*+$`L^+xeMZ{RuM;WZlR=ZXIZmK|KX9 z8VKxdAyye`2?wM9aG2Lj{nee#N&@hZ2S7Q8+3!Gv1;UUlOZ;MPgGZB&6EcoSb++0G zr@Fwxb7~XqZ(kGW+GUL6@UR8dZJf8FPa;(>`c1M?uiK9w3JiJw%lf#C8^YwgSh7Nm&X zwsAI>mT;gJzq8loT+i@B(lOF=9?`hrTM% z^4LNE-iitaj9^5J{`msKMslK4M@TYJ5Mfz&Z)U=&FFX48tb3G_yUaZcm6Y2Tst#6Y zZ8$M_pw{Ck577I9Z(YY8W)Db)LLQY(K7yuaM_x#*(IvN>yqBrvaY9bYLH>aJk>2P} zoVriQ)NH}asqXMVYU|4xM^jl<4Z@=Y zS>f!;``e9$0^bU+gIsC*L!XVr`Y!(mvl4NYUg5cmIyP$h?|k)po_Waf;_4y~fAH#b zn0}sm3pr@Y6ys*|TQdMs)2hon4+NCr>m8F~F=*9_Mqkl(YIi_@ugrX}j;czfG2)4# zyG@rRPW!$1C+u4S)p1|~=rc%?jX@mPZu5rvHBnwCT@3+8?WdIgv=t-nuJ!z!E=Ave zZGX~2?C7&DadYH*E2c~Vb+r{&2ej5v!_2F0R&}%t4B&{(9ewfer`Cv;Q>SV` zZN@eDqpniV)|jK5{CC+xYE<+c=xP$Y->z*riCFbJvHmX8A|SuJz&-H8$iqcy?UpiZ5tNM6R8Qq6AFO;$ebyN% z?$9*FHFf=Ip)CibP8>72ewhtAnObMiE7peG(1ad?4%J*_h@6-y3zzZEiq?a8L(3x=g4fGR47IJ5kV( zG=!Ca4QNSp6Pi^!QP3yIQhYhqzpQpYWy$uaVzR?jq~ZRz9>!Kr$Cm>>9CSW?h6%1Y zDhdRY!5}jdavd@}f^izjFr|)1FMPz;%5)`u+0iPW{#0*|U2@Xpmj!WiTYczGq_0d_ zfc0*hq>Xo)VdcEl;iC3i!5wY0s?f!XJDQZ~_!wy4g2Q}Vi%t394c^%us#s01w8vXS 
z!no%F4Ro5ce2*wrDtUoQuZBns%cY+lWeD%XiRC!W!!n)%dgaEE0Ab)ZV)qiN34jB7 z)qgw2nW`482?_jotrqWFli;N}ef0YKvCxou0s7}tuNn^AQ8c<+p?sO6V=UcqpULZq z4F+4taioUg!R9rM{8ywO@0n_(sUm+&A@pkLgJyAsZNjZ`$}@zyS@ilzVjGeHjNIZ+ zHbES^?!WVUjxGpb9k60QZnYojXb|Hhh&HrS^w4dxvWPui8Yb$Et(RUn%~ULx4KzVsNcfl! zn|GdJh_QMDJLR>Lk@5otE?G_h5*3=BHoshGsJK#?Lq;BbH76$TbeTkpg3c1IBH{g} zOHIJ>@KObdNvLc60r-cH2Mdb|XH4l4EQ3O|FTCV;9A*kvAC&e?pB1`r2Ugl-dTmnF ztD!i;Rk?_HT#~+0E}YeYRUF6Pd{?feY^IjO&u>rSzn*J9@VNZF+f5`!Om*-$a~*7p zaDLRJ1zX65rnB^gy5pVKT(*ZJO+MA|>9&sJ=^rPZFLm#{c=JSEs);yf%>MY}+s}Fh zmf3vceS`Q;q%*+h^8o9EdRCDCYzymouwFuw}kn~L2(NhLdZfId6 zd{`b?qKWPvV@CZFKWibIk+0euD48yfI23v?%FOwQFIN!rI+jSN4UL1qy;TE^Qn;zm z){t}F(%sPu4Hi6pM(V7>KOF7k4@${KHt%FU$UPQrIHYmpFA<3(oa>FhvsD zHJHj{pu$}+%Qs$I@&nWwMPIyoaim-S#*|jx%(Qcju9` zj59Q$R&CTZG%sKsrxkwJ&Jk*Ete~>fJn|g7Rnpn+`PIjBoXn+5y#$-o1a8~_Qzby3 zT<8FEdVtlBW5nSBCH5<{ZfpfcfXHcDv5tJdkTNL5z+J?Cgs6f3K*nj#c;SBG;*lSh z46+=D%Yu)gNIT<)zAJBQVAu#qVe6FQSM)n1M6g@$t55U}vFpR|AdBW0< zSMOeaE4xA05&m@Qa$|LKob~Li=fF&`gK&#Y@5MVdM%xM732w*-zLR@>Bnsao6AD&H=qgK`NJWL z(Q-XFRpVHgrXqT8cCatN|3+4PYHifI_wjkc+|d$G-s)dX^kVRlI0EQeu3GN2Al_96 z8Scq>N|0%$PvnKLtGx$&m)vWwTgmhVX;t;mUS7rE4sRbS{chB4Kn0H-h(V(;%v*2? z33>MtVz~K=Zy7vbTQPV&ebutwZ2tAgH8TZ~DwhU6xh`^3h3+*14hJhh6bq^oo973^ z7844Ck^>iiyf1%eE`J11El=6P$llfKIbPSIQ2TWDsx|lQHMcL8i?zQUXxIl%G~5tD z1X2mFv3l4~x74EM2U~=HWYn*O`OShJg8LH6ePvdZkc4ZkdhjF)R*@R= zd8>b~(TiLBCm}XJH&t?;ovvu9G;fmnlBrD-xUn<$!Hko5)6hb){eqo_o^b=mqO`Ao zk+2Ts_hxT-@K4`gWis^3G_L#v`4V^9e4CZ`q)gQP#%sF~+qT=UJ_>ld45)8elVbmy zO*6`!Eu7dv^fnZT3Ba9fd(Xl#E#1j)wmF+$^S_S7Tn&rLc<_1a&h6(P+}!h~knjuv zYgMzY3xH^aYayr=<!26M<)wNw-%w4C{|h|u*4=t zrpHlv-<4PE=^J1hN5iIcg=#AP{!KK(8FKOFr24)Hl@zw3JLU?Dc4^ENX9v^!8^=HBdo|23eAsG*; z#WRTJm-w&#BvYoO9T@`RGwkv6th*==gfP>T##Q6Nz#jXp9I0kLw9hSV{vP@aZc&4h z6!5@!NBhB&^j&tJaMZK@@PYbh$loGqC)0q2ysL$ZMvEBA_X`;AqSjTA7H(TLD|CF> z9lk?s_bQ**-az%9)11Gz#_uoWUOfhUc`ING5wL?q`X$UouqUS?p)! 
z31wyA#GyrP9C}#pl%be*cv37@8zV)1Sg8}!Y)7U@S`YvoV%mk+y6s)#$wQw*1vK)7 zlRw5!QE?qfF<5;f;7(}2De^JA6v~SFuAJD=74)UDO9OQn{q9ZCQWis>Jwc=f5&+Yq zo1RX6Z+P%yDMqyL`y)pym5;?PogU9mRdMonP4X(2A4$q8I{ghbR{=)!pe9|}yo(GS z#Pl>>hT-346?C=Sl=PeA&A(O%8^H1wFN+cByK8)@CKEJh?j4JC@UQiRq zjzWAbLfF1^RIMpR>Xqpn5djkOiSN4iGOR7LJetjVVo&V&Fz}|0r{mfgTD{i#QxYg& ze_bkwh{*fVqE7{CP}^Y6jw7lqY4i9UVPVr@A&XC+UL5K=HA6GS!>mVF0b}u=g?)8Ky{VL^_6#`bx~uDm$J|wXVF-{u;jPkoi`Ebh9`J zP`6`yW;GN#an!pQv*8wbsg?mC*grE6V-JVR$(rs?33E>#R5+BCl#%>Q9C>ICF*Bw7 zd3oY1@K{PfBuJeAw^D_CnzH%Y?VASAjNgboQ|+*%Lc87nQb*I&qwOcl81;6RrM|L{5=nJg!gv%?8&dpu^y4tz`YJD z&v}CEM>1vk0jExwu^pPkP?y<{r^kOag7K1n_v3T*uHT>|IBM`RGna7&(1bv+M`T6W8dNiFjqgK%Aw~5@F>}v@1g;YQFZ803zx}a;EqwMm`Ym)C;BrEFe&N@8)mz}HzcJc}4Dv(beCMU6!D)5UBbCG-p zopM~s$VA+Bj6$g;X;~a@G9Fr{(AVq6L#aj@e&_lc<=O>aZF9T1=QViX?E7-IxZRwC}AYo|M)KXh zF*X}sc;B^wu{~Ly_SsxY$yNAWw^jxS$N+d26^eb216qQ2UMo>RUjpw*rM3YijJz}~ zGODV!5aDZlE#pZJe@tq>pN_^>VUnGl?WM=&teb=A!A8yavt4h$bcF5Nw1&<~9mF^=_KhFrz5o}{&<-`=QHi-} zX4y%95xFa#$8{&CN<_+u;)f2g7q>l9rlr1xs5g*H+4JULzR5^47frbxbPhjA6oJ2< zqt4?HQ4hqb2mBR3BtN38Kwi1~HmrD*FXA3lkADjjJP}dq-s+x5WCWqV-Yoxh>joM3JL+W|qa)vQ(pRx#m}2 zPRmxSJ3U$9J0uh^)IgX9Fzzu13frNZ6oZkO&+_%@%+aJ>&(cKmz-N2BZ&AK;-c<~l zp5LlF+ovUAtfQVwpSnB4+y2}FrpXJB4XakO{?3$pH$p%|p_O@>;19|@9Pj5$4` zSZ_PA%$FXVZ2kH{Ot;<*885~q@EF(_@&IWgv6|({jlT9v$}o|42CT<`9JUb}1I^_VYj7O|@vxKR=%yE8 zPcgcgm+rVe9|#IBNRT-%8d=z5CNWY^&B2e4-A3Vim7h?p;!ByqbNvck8^Lcn3tFn( z{UgOBlXv>RwCgt z;oj?{Zd5FG=ZaOV;}l(0&vdAnT&$2nX)ifl+UkD(OTOu`z`>rlkL)fS)iwquGK~8e zV(d{IP)3fz?c{p#rgHE%ZzrYjmA|a7!kdC#)16mEbRwHh`q~e#?W3Z&6-vWf2Hp42YF$th==Y zmS_Mv4_OO3t(;6?NydpV1^NkChvd+*k7-SN*7|wydimIxs=^u0g%$#?FY8^b+YbGz z)7$-e%C~G-8e>DSV<7sM)1zCmUSK)d1GhlIA%FBITj%?@?y1m_wW~*eO{!)N@{4;` zYoy*hoTNfBpcG_}&m-eS&qcmg6## zYAU4NGB0uBYhd}YzmV$37+tDWC(vN;3p4z{?!CSq)VjB}!gJb- z+ee7!=TV@10v{WvUuEU!t1>uYA2Rx0vr@q~>E#%Vg=^vBxUS0s?GA@eoq=w({wn)+ z^@wnBkVYx=Gd-W%X(y}5Jm)|n9}X!0V$?lNX*3~D;ihs={xnE;`Bm#wb^|h z_nXmn?l-U|TbeFR&(uFz#`iF)>A4{8NJ9Y=S)J5Tc9L}I&h|#y_O;VlJ&kXuze0w* zUZfdak?W`zUN118e)6%sj!teXoc0D~mm+?xC-Z z{91VIzHQL`ami&$ic-oRqjQY3+nXYmcQ2#dh?B$)LM1xh@F?8Z=6LB1*(In$a!O_G zR(8J5^~{^vR4A`|VfUn1j*rxZ0413!@#D-TR{Tu^Wx5LZ15azAJ7_NLsW7_>V_Ml# zOJ$!~Bex=dr=klg-XizbEK2OSo3#IcV&cdqge-XrIc7%ztaL6@RtjP6nY2W9B;)w^ zP)X@Je0tT#nwt;5UdR+~TAQ_aCz7xC_<8coh5Ftjcau^Hm#tv6q$LnzadE&i{tUe^ zrBgJu=;re;jRFkr-73qs@(rx3eClScRZ|`Ar(owlTWoTIBIJ^WSk0iCzQu~Mnt<2& zLXZuMpg)d62!ESWipQAm91d1Znf}yH8ql8lFq7*xsd##KV&)$8-lCA@tk5-c68a8m zn?6680H0+F&&?oA#s%{xAkT@#G5Q+XEqTZ!raw`fSr|4L@8RWs{=psd{6_#nwt+r? zBFEZc=;u~KJ~WL0Q!^N<*luKvZ5%`Chof3N5YXXtw&(L?3s*$b!!CWh<1dyI*;%o= zk2_E?qQ_Z+Pf7=p0jdP3Dc4S7CaOps!)qc>4I+40^APmG<5af2EnL4`T$N<0x-n{F0VNLG9=Z>k*S5IK{3@@?%-8 zjw6}Z67m*bi`Q0vaxCb7Trcr`us>j%kL-=<$9itp<(_XnhTKdsKv19!UO^sgvd|`DRQsM?DQXF>^gO<=BHKhgu)s^A7T6{EtBw^{(9eQc^kgrzw)sI7? 
zpeaTT%l%C~EQXAV`_))j(-PAz;kEZo z*=V(Iiex6)p7ZtXjwcdsGG;3Ivf)VJX^ySYx5M=`opu}_m~xCnw!VaM=c|W&-5M)~ z)5S9B7w-$)58ikA&J(|b`FB3br@^4=x$QQq*brc|Qjm|;Lx>OXYHjVLtr8rkhCUda z9ojdN?OvPwC4J_M(UY?8{Q`Zv4>sKMZ{5@K1?;L@SlO_cc^rj7TctCpU++z~6Zx7g zwDU7%B}1Y#b?i1WXWCQ;`sEpBSKXhSm{thCaA-S+-+`^ySyx&3-=VBD!dw|Z??5R+ zFt=IaHM+~ff*6h6n-q3;;HkqQ%|@qyNyR9S!zEk}TiHuTxOq?kK2)H5+r&-)oa7?A zI@ZigeK(#PuvSD&pDZJK+mm}=%{RzL5ki!8st$mGKxS9|9qp3)9(x2@xlRyeqoZCodM~rKI5O($!fv@PH+hTo`13v#y=dJBc3;$(;y(0MxSIreSJ7fPO*OZlP$;H8lts?3xT}aZIs?^l(*&#@RT=yM50Y|^hzvV z4tegof45uFZ;P$TCB9_9y?U6|11FG^+JVw=#%U@#0b^XSc&n>nT63Deso{BJ^X20m z!g}0~KHs^wop1y-7Y!nmw67F^BX9cv2a&TwzFYKrw2uvi`Pig*h?uz#ftazHqvb z(%Vh#HlGmCT73~{`OwccgNQ!=Fc5nDi=wg%^+g=2E@GwPg`h=&iyK?agRs(c3>95AFD0V$_?O(u&Nq z3e>v~<>;n)94X!VEM?ZE@`tiC%M=9HUD)>oDzGE~=8uB}OZTh>`v@NW@%r3qbJDMV zW!7H104`Km?BJCyBChzx4{oM-J)P?f_*@jb_dG_Ax_3m^G^zO*x0TPWE1i)M3>9xwjbQ;*v^|Mow@7yk?bu(|ob9DKZdaWW_Rd;m zuR}%Ep)uvIF+~mzvG6U-_v|TTomDK#56GIUEfJ|6t!wiB4+x%=2oq|~a~lULzTj-d zcM;7}ezgtf8|fl(+-VBhZhW$TIP9<-RMVIisW8MT*q^k2N=~_bOk&FSv0vw;Dn-FQ zk1I*>-Z_n1-SX*=VQ zCyp~!k4+9v9zOV$Q$@GJ*su|7rsu4jmP6Pnkd+}sFbb?EH+z_Nc%OP{gIU}*osf^F z0o&}X&wNN7P&r3g>6!~=wikxshY*QqZYD3lkoVBkS`l$|1YZD+#sB(|cxYqy?ODrS zvAse<9~eAhwNZvRCf>9J|=|8`FX@r5wJ9$V9+EvzmUYbOP=9opebWW&!`{04BJcoA1 zojGrrIb* zWv=X)XrR$vo8<)GT=&)bB%$ISHKgZx>0*8Ds>7_mp_NvGI?u^v^iobdX36;_+nz8t zkARgBlx#*QJV3H7V;l(6VT+VNf|K4KzpJwHTwOoE>6?BMt#)XaYJ&;!#*V`uK$Zsv zv@9O@ zoNl3830Gg{Mz<;>d;BS1!7T!tDJla43L8v{8Em(GKY2!e^IMyAO}tO$G3Q9c!@(Xyo)8BU-4 z>MXS-GuA8>y?hPcj|VN8u8c$Jjjdtn$GaJExh2n>ivX8O47m43RM!B!_dL6eSpB4v zc&}z=4^qf9?3;ARAf`5PZ>oBiyyxg$Andd-Q$PF}2`k{2!WkDqRln7M=?~6-9UMO? zOvqmd-m}+7P|)`KjNc(utz&+N%DWgKRngZGDlaEph~d-W0;<6A2ZFq=Ot6uL0_+|% z^#(Ung#Q9fr*(OPVLZfE2$SrPy!Mm-rSY52`YE?bzE26^3sL0R`Cj|6i@wy$k&9dX zk@+a9@c^vIP2p$gU;=156e1%6?dT1OGOf1}+-jZfLevx!yF|iu09AX+bHgqLrV;+dUe_y} z`Y}VMRoZp-IX|yb-^Y^jJ##&$j+ail5C#pz8fz-ntv)Y-wNu_?5=cO;Z9~{eoN$Yr zlQ*>eZ1G0y$WOfHdhV*~FIVwWoa>RS>_K9_6Dlap9ntn++!EYw8L-?Q*y$D!K}MyA zIkEZB;OzSb6~=0n8&%zDCNc5_{spYu2xrB%lL3mtTD8Bc6we<_As-)dRZU|Tz07;! 
zT(klJcJ8g%R&+grDQ4G-+X8Ivlfn!&g#IH;l{=JE%@MfM%C# zw$T^VICb5RJQE)nCQ@B+6B+buRuy3mH@g*9U}(c0fFLRQtwm|*w@jZ}-IT++?bwtr zr8#NA#Zths&fL{mStRI@Ixq4gmsrqs!}BYlbRSlxp*GCP=9zY99DwUjF_e;W;mB9t zd<$5GGc+i`U)NOfS&F7?^~;%s>r$WZptnPRq3;TP~ds zStojFZt+8Uca$Y+a@>z+jW4IszxixjEH~|D^9^DZVT2PLOjhqcJHB-2=JfNd&~P2= z8mq+KzU-xhX#qPxMG#fC^gehCoZlCznkw$cfEN$)q1q{wTKG6?YIJK*t9}{GTZlMlz4|oJ&l7^xq-&W`EfQ9 zYBgXV2t(P!!DZg6ul5$j=6Ze=4Zo_oa>}>hLXdP<>Bs(?0TDU@S5x1K zo7OwYYDUc!BSn4@Yy+p6>^bh*oM_rc0DpdG& z22k?QTsf|qxYP*nVJEs6D_64&-NxB1eceu%`#~_z5DGCn+w@rbYUHHd`AQ4*EiR)@ zo^Brf=}DtN*7bjP2paw?CtDCau8+OI6Em?`3@sWIK&HQ$RqQ@A)yyAOQCo7L4zdxO zd2-EojJ_7>otpcI<1kxzI^zAO_bjtD#dZccfHBmZ@A*UpjapdRB7V4x?|z*c)KuN{8N7A*c8`+9OVB7)o$cG3fo=69@ z)C~X3K5aI3Cca}|fBY*(-L$lFPb(KkxLwEEu;wzX%Sn0*V=MatAnF(ib4JRgxvY}- zn6-Vb$9S5+N1456JRd zeQ-le{Gs|Q>Eh?B&@_P5A({*sK~eH&++?>wI|w8j%oT0)F+K7eCF5I27m#d>>fw**Ptg$$cYO%<127zH}!jx>-BZ7%>Jv7J7vVbPlvpkghTLn ztTgz`Y6LIT140e%KFSLSff74q)MAI3`j1>-r?x4HoOE5ef*-pO`mMw7v(t^u6+Vp3 z##@BD_TrxdKrkPw(0KrqyNSXWsd46Ryn`o!S^@}pWS`h&g;0ib2t_8`?EKHl$p^`I z7R?Zy{u9JboGj!4x}TyGX(fPl@WRmxtH#RJH~0*T{O2QD_MWCnL zs3%;Ucb}JFB=E=F$m4WC#KCTX;ft~{ep%gFxP2}X%NhgwYu zSJ3=`K3aY01vUQrf`MqKgpKh_*rQ{s!MOC^6T z^V>gH-w21HIUI4N+`S$Y^qhCr6&1QvC=n|2JsZE3^;(2OCX`$=0Q_h>L$NAJ8e&(;%+xv z`Kx}(tvacbCH}ehB6cOfMW z55TWMwkN^{P?Qo#P(vWe``yKN_TKl5-{&4qy|*=M5I?x+?~Fhj{XVQ+Xp*rc-L!9ByD*A4A$DffLw=h zvR7#eKwIK_7joUIc=T6~wOE7*zWaN$kr1GbR3J`&MUjNt;nj$)m=@86x-TNb+G-SuXzb488Iv@CU zr*Z9$?p=#`blhVUr|Rzbw-^L5sYzCq96D0WrlWKje>CZ#hoLJt~dE_Th3LtDaY*N>Df#3octL6lmH*8 z$et!_om1Ra&k8E`sT=&EUuW^Wt9Z@VKfPaRho+H(9HvrsqF#2Q3nK>JM1mXk7Y}6y zS$UgS-P7rRcEW^lh?{Y_=9AXmpZ;3Ev%SY}mhR6qjMP^4@^LV%TlfTvFxusfr{93b zwj)n+&aQC~l^P;~z5DQGpwbh17GQG*)=bkt?mjWA7)mAC$*>Kvpd7#s zY*5LauoxUa(OpagE!T|{#<);5$P(U-Nio`mk$!?G8#nv!_E|@B3d|`#Pv_a~NQp0F zT@W3fR^yZ{GwPvUlVT_;fAlR!CQ7I-Vnea~ix& zl%4V!oXg!$&#hAlKg&=K3=q_;I*l7Gk+hkV6x%0zWG(!e5rQ>N!LjzM(Qx1z18^CP z4ST;M=5P`PA74fYa=F*+lD_~pHdp)is!WEuuc(eC%RJhTQfZkF=n5s!g4i+;Xk~uf z^s@{kc<3=F#79oZOJ5_!xKj*)@SqU=3Z%OGXQem3co=+8CJ^9UV0XGit z0(4tKG+9zNdzlp43T*~Bj8K)=_s)~vLHlhjrpy0kklt6TjKH%N!KK`uUYL{zUj7aMV{TkWxhOp zd^OxiI1otM0{4f>`H=;R=>SB$OeIH@i=(!q%{pzL3$wj@qA4q38RuK& zy?zced4X@YiXDRkhCTQ%t2g(w*wRoJyw+uv)<-3<_8HJxF(>j_1Uvc)+R2nY5b3g{ zwRrvF8PD`t%4f`Y_Pezk&ew0HyuOT(W_7Xm;Mz{XJ;Abth{pvajU7KpUrqdlireK< zfhYX93oQSbPaZ#mEL z)P`S8NA*gb1P(c#dZBwZz&+wg%Aj&nnxJyprl!U#I9-5%(&juhTigi@_|FdF`Xv-{ z=-ZzU7k&O*9#NJtI;4E?bi~)>w>Bo$DHeu20+&d60X-i9Pt^1uR2lXItUOpzOOd=l zubNnykpxL&`?EY>mZJSAQJn}O$NKnAA8t&E3xb)aY?~4=3AqlW)zOl%fe&>up!tQ| z(oth`Er28j5)P0PxY8%w+L`A$0Nw7|0BHKfi7)?nxT>7~bc}c(dJJN8UkzLTAJA3u z5WZ(XqQH1Q)2AUgUt%tjb<7O9Fg~-)76N6)H^M4l;<`nQPAJ|Gg!uNF zCj7spZVa5#U zA%y10JFaieg)z!W-7Sg@pEx54HQ%#?uV$W%|JI89^5Xa@TljN5Qs#q=6}$`(nt8%x ziHkL?0#GEosIl%&n7svwcmamUgDeoK_9*Uxts~q34*Q&uoGc*qYSC?e3nGdmtDgE<%C|6su1bQX~~v|*~5No;A9a#%yau~362i3 zA7jEZ0yKTBJQfh8G@FI?cZ_7-Q1$*{H_&CUUEXqd6jpV&X_^65JNi@I2c?`97SAFW(_ zeKgSvk%9Fa0vP0IGTmkXNW!}^h>LyHfwAQ4G>ltO@3jnnhKzB8bcT|+lk*)@-7i;i zlpY`W)q8G>Yrzsg)0i#Fl;GMa8&I3a(!kB?YEd00FHc$#uPh{Pq4er!4Hr+NvW3?miBfd_f$u~RR!pIog> z*uBG26MOB}_9{U|gw7)?@fLYX#(FUW^^kLzLJZr5e^8?#t5xGCBI+j`s(k8ekH1ty z!d}7QtIvc4&H)M+;EOUcu+%DwY=v=JrQ2}d&UQHx>M!gGuT*6 zik{Yuk#~P;DtfRr3KB^NlhJVw&eZXPt?PJ$2Js+%lWs(C;~jgTct`m3IBLzL0)*P# zr9ni#K)P9&Y0piBPgfG@=Kw;U)yG6}6LoPsXd)%}O@&4IkH-3#fC@Jk-KovdJ<2*( z%(A^^JA;tA3K7Xbfu45`R^`Ph!?Q1f@s$B~Rddn_NEz$^QW1Au4s6~HXh;TPdbt(B za#2(r)RRMso?_x9#?e?S3G)ZDiQ2mXt&;h&^wzIN7B;6FJQT0K;2z^f;-+(P0F|}4 z3>HRpRs`Dt&@f4256AUDVs-VOf}Ra_5FN!QGIxi4KIvV)Cw7Na3$^E(yV?;+>@ctb zhyfCDdYKkphHEncCh^Jo^VpUH4676eG_kEkX-YA(+)U{C_--^ay28Fgb}imLFyyD{ 
zcMBe)U{=9~aH9;(gY6H{xQfkIY;8h&Fbhd_S;t4aGwRYx&XT0_sx-!S9Wi{l)3E%j zx`W{4yS$J}BQo0#fIRAYT2T`}!Bh*&!?vY6YA2$aPd#hPV`aBxbv`G}FOeqC9Cp9` zy)PzqqCMDC?@Ok}zUy|w#ABYNKJtF#joZ9a?vK($V|h#<~YzDc$DTr2p^ zR2sS*eePh{fs+38Z?Y9b=iR@z90M>7j}|e`tx-cH8}7hTn-S>nzv}(G>Z6|8G12Nm z{tHQ)?ReJ2I|FBTdLA-&h~64ovQ?k}Y6p71c&p4is_vVceHFjI1DQjnTO!xah0uU+?WO=SHh!a@aIZ2JQEYoB?0Cxw z@Wye_B%Ge)V|19|8YLD{gBN3YI(XH+8p_L5b~mg_%Jn*U=G19Bxum}2U6|n+0??Kg zI`KEHjs;c;8biJ+j^Z4h`HM#g<5pL_t1h{GUI3BazO+5BaME7DCdkXyrRRE=A@Q*e z4;1fRp-p+Snpg{uy1S)_ak_iLlA+g~0w3M_~r{<*FAR(cWdzVBPd$OVima z8#^&q!kQtdgHl`Orsu%_;yJ5X$^`Qf3=s7t)+(q!0mWbI*IyI5L8-~VqFmV%K!#lT z6BZ5C?5dt6Bs}~A-DC+G%L3!xK)0M!nQ`!W_16ECHuGq$hWXNi2|yFsUu~|u zs5F(UK|wr#(a(XBz>rItw zPk;BE6XIHNpWr-^q>`EG(fF1fAopWH#Gi6lv-q|^xFPDfj zhfn8{0m5<<)RrsF;9po+Z82np&x8)YU=??*mqfhzRB?1S|K#kE%hiYSl0VE|wP)NH z>q}zk#~!JGwsC?OVsVc8jQ;PPv@?EBG?bs%@A3Y*+vY$^=p>Ey^igT*6ZrJ&LoG4v zyHF=qg@uz>>&DS@vb~w^H1V+)*$C ziHyznv%APuc#fn7m9RL<#kJj6>nd+nN+)Cz8~iw&?wZBAsSA`9WeR)q%BrCr4s$OpPk1%A|f zTB@IJ>)_l2W;KLg=Le(nw)F*6T|ZbMDRKW@5FZ!~VJ;@_&v9*_SN&ZMw~A|Xl&U~q zd?`FGefVXy`H*;t%z^3?AJr5r1C*cQ=gty=v73I1qa(z?tF##!iTk>W4AzzsWmu(M zGRvjt?9TNtTH|LF5`KK&9ihEnX_u68-oLl6mO$n`-VM*fZ^gYE%~b_2%MXK|(n{A| z3A0h{u$9{HF~r3Oz|AT??#o@^77~55!|+wwZC0Vohul291)L}v(UCqI+SfZUo!lKh zo8llRDDN$Ne0#}Z=S!^2=HL?_Vp&leBniG?63{Q;#&4N&OKQXf<{{xBL*o-4hp+#B z7B(AduB=n^K}qGbWzFL@CB86ILH6Ck{~6@w`8t1YKkwQL2n0X2gL+*@W{lr3@4 z5JCTjD!)7qU;3?J1gR#_)Xb(}X>T58?pS`8P*%9l_Tk2+?;_B61gY>iW+Ld078mFf zzFH+-QEssO$}p-1C*zD9FS8{jCKs2U*)`E0T6W@-B|!Jh8w7ahgu7}nR+&P;83U2G z64#^j(Kg(~#Q`%q>Y?GjgPFe(`fu$scS;*HY#+DMtv3*nb>HQ-*Ys*(tL*5ifQj9a z2E<$VL?|Q(th4YXAlwxK#^WkCXGy85;p4BI17u^rkqL!P(wR!DI?uKa=R4PhZ%CVM z!(K4KfZXI};NO82`WZ$+&&bnt`%$I8_-y1p`X0!%=qyt7Dz?^s^$46(Di0pe<@~*;TNuA{Fr*2$d z+}kLVLPET#GzgJN-Mx}UiCP+TOEFJVSW|w1m}~hT_yMp2ynxY7mnSq%a_dGRp$)V4 zp(Sy(OFLAw^iIB|;f<-b7p8F%eZB#Kn@4``R`pg%wwLxe`Gcag-oq3GQ19u17;+83 zhX=BKd=wSwUL7v`hvr2P2+ul)ApRUFo4nW|rwo}LmO{sBx@Fr9AKaAH!kK)s?k|dZ zhf{J@2QP6EJCX}_tKkoDlqy3P`jUv{^eRVLdcBxg)CJ`-lKr0N8>@|r^pI|51U?`9 z`c+CKp?MTMDE~3=V<%`F0-@Hhcp!=+1or5gyFXs}+-yEp8 z?$-A*>BNSoN*S6il|Q8k^WCvWBi`XO;q)77i&AV5+io)fb-xlwAcZY&5me%B>)78D zeY>I7MSrJmX~OvvdMQ&yS+hr;dw6@CrXPNO>1dlX8$iw6wQDDgFEg?9 zdM2Q$=NOcGhulqwnkb`To?QP`+UJwq54M|5*Q@d=@~KbnpH#-3`DNV?#Gjtvft(|Q z5e4Av;^7_mYD7{E?iHHU(H4?*ZXHG1o#FAd@?9w`M?OKY*JCZ5rdYey2Y{4|*fxAE zYEfVs9w$w#?THv|7a>f|g-9R&Ev(U%0{orLWsXC4(l()xEp`)mggM&=7>uQ1!b+V! zfuf8?cPJc3zG@f_2=kx%I%PU&WT0^%xU&0l(gicgTelUGevWga4K3L?&Oz222Duyd zO1mB8{IP?s;v<+#S_w;UDt!D)%x-+_*QNJLWYSbsFF&$S4N(m?*!&VT0`lr0u_`>l zcIWKo7B`Eq0a4;fET27k7fNbsF3TfksNnUGRYL8zK8LOw>8Z6Jv!)z^c_pKDy}cI0^3$WZ$_Zyf5MJyx>wdEBA_udK-~GbN`F z+wSwd+TMh% z-|sWU*afK%*XBWZc+PrH6d5{Xop9Dds;4R-zOBBUVN*&jQ96<8dy72`xr7% zR{|0QS(FJqz$w>PgD&9ccHCug+#K7QFeXMyf~+#A+T6HZL@34LZ{+uN1P5#{rwQyY z%AmHCS>SYGGG-CGYHcF*>Ds@;%%nKOxVB-~1dQGe6T?vy|KdpkS9c&G##@%xmltUW zuPfZF{ zPpcnnRoc;*&*{^Qe`23!16AX^a3nqoiiV)ub%39=xfrt%9Y(5k2GY0E)X7S^H%;g7 z*u|dlo(esb?Aq|sr!ly?7*R0KWK21_n11Zv{1%WL1=c1%j+asj+u9~?M+N@p08^D) z4ED<~aYzLk2pJmp1xy%vCmd&-=q{ZhTb#H@CJ$~QO`E@KJ9GB_qM7aEorjvntY5)2 zu&s)^AOH5QFThCyOgN*4=uW?Drrp&qA<=ttuDEyC8~T}^_iqW}{>ME#Pk1^XV;6Je zmZu4l^TaBA9Q?Xi^eZ!vaXQZGxXid*mU8F47G`*xb%xS=Qi|rpjwxm-Uz@VsQ(PNx zl&3R+vus9lXK1wJ72sEUHI*29=p&WW1o561lgc|Lz3b;OH7;-OT+-{l^Hwt=u;%-! 
z6VL|I-o^pM6%_3UxgwLd2?9Tuj(P=0c)w_<+e&%$$c-JCJ<+SnJ74oZk=+S#fDs7$ zgnOro^G33F)S|fJe!rN+gT(1t*h&nZBm;H&0~35EEAzT;5K!0_TQfl}0!QT9LI-}` zSPGk9_g$MK&IeiBv5(=#j{zc(M_tGQoA!WzA7TIVHx^u}%^&wo99g5rjX8#g;s_gX zJK%V|QwP+yY}H&~_J=W2(?q%xa_vk8O%P-O&KCm~`GqgU=_L3{ENmPkK8hpZzsUSN z*rb0e;pY8oh-us(EVKe>7}&S1xjO(zW=Bt71KC@_mcbO$UY4j^p>LPk|M3XZEf?6>Y+v2&?^ zr01srSQL(fUReX^?=alA4Z6e5!9N@u{ncw?J}`@d6oGHW{}V*UfT!QL0Zff4_`@;6 zTuLktt_A;X$Qgpk+pB22f$5=X_@TZ z=wb?)R?g;goXg!ltmW|;PT8H^O}bapxUKGOC(`)%^RXDcb}xSXR4$lnA_u@G7Vb@E>iXf=bu`wsSWbW_oZO}zb=ROB`LI!s5Y?0c*DT8TY;(o~1gv*meu> zljo{Nb*1!IxXe?T!nQRR>UG+)4eRb4MQT(m{a*L_rxFk^2NeSCG6IR8ED ze5KWnwSu6pu6b7K2}6&I zV!w1N?q+A5`P&4q;OO?+>t<@PaC$`?2;5x8b$kU*$1CKoM1&-c;vxft(@WQ{Q0kBTg;%$F)&R|w{+~{OQ|Lsx9ctYS6$2tG8dgG$}|LLj!Z}k*1nMij=vebzj$-~SjFu{V*r4%n3KU-k| zztvDX^j?ugAT^Y{T{Tanx+6rcJ5uV^eZ1cNc-TSrqu+~gXg$sfn5skYmB;}YZp`i^ z@Y^3sUGQBvJ!f%?aC;V`1BTBA;_l5j+7D3${7i6zT&lW%-kCIv%XZO{`&;c8bj zfrUhYQ>DNO4_OnCups*&<~E~vF|-2h)eVUabIhc}r!z4pQ=Nf|VW~xmSL|2&XAhkU zWOjF{PQRuSTeLU*#q&0~05_UIgyI~b8vqu9TmjJ+@BmkEz&_12&NeAEo-nIiL4kUp zKr9GK*JUhDP~i{YwSZ>xQl~+#SaZE!Ooay{e!Io<-c6NGV`~0t+bwze1dWy^OCu99 z5&7U%ONjMRw3i@hInc73<-+*cvARxtV95Jz9F6KR7>Z1Hz9MqK_R8~lOLVl`G3~cc zEDkyvLcnshfq0@j0)!%kv={{)$SRNLO{i(uIM!0VR$Jb;(SyuE)UKu5!Nzy=)UWFJ zpYaNQf6eIpO0ESrn573Q$54yE>Gj4eH9Eq$mQIeq82F~&4aoM)-V-Lt={qd$FclMW z(;?=nRnUv3$#?6p0f0dRF1V^2BM$0A#KM2&wiBNp`!h$>a8!Gk6X5x`HQ!j1ZM9=` zaV&Gg+C;8UHLY2(!ze@dcT8~f#Viyz*chI+5-SY z{+w|HeuGn0?puJmsXF~15tX7BzK+4kp6ik2hQW1=ncFF^&Nx5amU{na>&x;A|7ES2 z=IUgRNE^|_hli?Oy#?eHO3&3d-PO>x82ROYck+vO$c-=U<*%B>HDhIY0@IYeHU~OB z=KYDFMX+_ymzaIrY|JJ0Vf4&}rlA^wRL3?M8QbGVgS$F(Z;9kyt-AK4^&3L!2YeG& z!kfqgZABYQ+eI#UHV-#T<;{^q%DX<0kJ;azxLt3S>TUC-4}ctabMFiNc2wV}vi$>z zNiK}F-;Acl4uF8qLU#go{u(@U(j(h=0PR z(~u&MDU-`PcQqLIBlKUg4D1O=r~~*`IV{jq{01y*FX+h^QC8{nEE^SW=;hw1(SK+~ zRebN^o;uTdA1MnVW7Ad(McG>&@R;l#_n*UOK@X+k+CqM59hflw2;2gEpu@OB=Od?y zHY!_|8mr!V`HacQl9FdUJUqM)?Ebl<<6^-TZk=KW&XKFqBPh!(^_01rmH|VA9Fm^#q<$3Af=^e$ z>Gqwq3ECh+38|)BxoT1qI=Uzem6toTVwlRZ8uGvFuEM(^dkf{HuFw!#(uh?ZJt9@aa-=-2m+Y z+Zo%Ft2797?SbwVJ{qI*41M*B)z#sy@)_AH4RqRj;gzYjeM41SkBGo5%zEQ#J*zO} zhaQfk3?xWkgMdsMKqC219j1iPWGYB3ccu~DYg}(nq4#KfBO&&;RHDJPCL!KP-L%4Z z1+u13<3Z&|h9?z)=epS(lxX?kC#s>XG=_F3tTsJ0C@_$60%kKEO!q*U0os@v}CM@h^c%8N;ug0(VBPHd}zqD9x03x#bW`pbahr0)7CyhS6vF z7difLg?((6ducznelOlqq_(z3|K@(PUFt$5B|JProN|y}`WK1{Qsh^J`)XsXdPTu* zU~M-n?tl9E@56O4`oDYdlcNueQ{OzUMg-)F!{foV1Hf?FT>7P?1MKjhLbYQ&`oVLH2S7Vt$h z)NNp7`QyRcM{(>~N>pumzuc`XVY(=!u**feLng*Ov}~cWIY!CZdG{T|^1*XOH#hAy z-Er}f(no<`<0d!_P}Hk&&?-#<7J_b_^m*n1y+TN_N3cIx9HS%jl7{A zQPK)@%WH+bXB&;!ez~X-19R^ji`j0TZ@Qm%HG?Bn(Ea5wD-ak0GK)BakIH(p6ka}; zwK>&ce%I|~zaZ4^+r0Vddn*zW=TBACLxP!9vJ`q15?rSHb>|-awDh~&{W3i<x1-2Tl%?MKFTwSYt{m;4>9rN*RIz9N4+$#}o*zXzk9KsnIK9J%A}73kn4Kj8^fgri?apudud zrvJB&cU6xrbNBIbr3R5(z@Rx$`^Kvi52z}_9KT0b%o5);&;uR;Z6x!dKBap(M?J4v zz#TcCz|)AL>$|Y33^L8ei)wX#`9jn=RvymZGvlyfDN3mKp#97?^D{g=Qa85jd%zVP zM7~Q}oS06E#nX|9=$0eg0_;&V=p0ej4vN)Hq$sGiQ{4G!*IQpJl;T_fs_ET5P{9LG~aI^Zmgfd_i1iQ`O&HTv>Lu#qP5lRszQw7*7YrH}qG z#Wa6-vRp6kQNQQtLoWC2cwNFh^zRWsUp9cg6pO&uEpKQ*ZYgE#Lc|)0(LdyPQHh%u zWL96|Ui1`yIl^|SSrvRe@O9uu^v+4CiAQdt2npgl~NYfG@Q9yUBl0J~&orySUO_8nySJPJGh59Mg-NixCDg zq9PyRUdxfGu9M!m@`?7eqEDVOGKlYIe;&8kZL_WGYd~=9c0g`X`M*Sw5r_a386dOr zPhiX+$Z7taq~<@qk?#qdT>$l`jvXW}dXK_?T?I;(|2$}0gWaoE$<+aQ){+0nvux+UTq3FmoVzK{+PeKD#cX6 z-ov#47yH{TB*26$T7vZVubh{>Yf|iOfY3~x*nHT zIdaryc*$1NeDEMier>Ssf+-Y}M(^RR+swHO+UY?r*dbADEy6C&e4zY<2Dy&0$V!-P2?G2gs z>t5E)O`mM1YWk-3595;k3N8GV;{XUepd!TmMw~a0VK*0>tNPEQ+>%JYyZRY{D6f}O z9sXJq*B_`PTNegw*)DFNc(V1k6lWd_RNSh72n28^T*HB%>8J+-;f^&!^4KNl-5WfV 
z;q$n43B1hAUpxt44zc{$Tjt23MhXl51xSuDBOsMBF{f3uvXpv0=fDB^7Z+jfZ%RyA z`*#ZOIijpMY7h9it|3^;^@~Rt1v3MSmOi3!)JxOj)A!8S7el*Vf2v{oWPLeVFL5qT zEmQ4@dy@AN9v+QpP^r7jv}cI~(K{%sG!vtUrXEK^f-X|f#TV!MTJBeH-6u7D2$B6& z?29@|$6Y)yVT{@Wau8t~Q3NIMu({;T;NMJ`!&=q3qVc#JuksOIbfHlN@8YWN%-wt+#4)a-wwJ%O^7s?lbC5VM-&NJt8ImF5ptBTrUb9&2L8AZfPAVC2~QRa^QG zeVuoMr@Z`768m=x$|hgqyBS4_lFMv`*G~9lhxVbDHNQHzr5P9GfkAv|D@gLs#eV;d zkV79nJ-Ea%E}v)G-o@h8QiE<8GJ3X&UpvsN5PjMI{T4w1Wg+MmZaNw#it&IaKG@_676c^7N1v_A?Uc9ud}p;CAAuBW5#Ssh zgv2rXhLr}Jk3MtKdXD!H^fcddK2|v0{kL+eYxucMAT}blQF#N#dMcJ%KwNBQm2Sw^ ztwT>1{+6b{0;t=_MDJy`wh>rahy0B|R45vweqDpE1c|h;dKjSqsVnhiwHv>SV^fCi zut&)F+Vg$TXAkE1uO~wDuJ2C(56O3 zb5!Vk=NTvHH6Jl@K!hO(_d>Qj%b`{-CGlG1!M!WHPx9PwNxZTDAx@ZHKCTfcR`zmFeluoWRaHoo1=ESZeAU{_Q0b3KH1-9kDbqU4L6G` z*e_gQ`KR$WppQuD-`WYd8X`@Qy1b2}K3uNV(oZ>|b|J2#(dFqN%|WHH*;(#kGUV~| zbgx{c@PYmpJa89v!q-yfE>%{&cm+k`e_!njG;!~Kj~)5V=A5hQ}SFX%1U4YvEsqL8=W@e$lYTk zQCE9F?arI!1S36GsXZg|RqlaLj}u@kq0{z2tPE1a%=vcufsqk<1hed5)Xmp7|x zn#*T#R|&JVFpkt^@I#rf7h`SrBLyOBjc(4$v<&$FX*I$sp@2hMIC zKXb=Mr~|$SG))i9=Wn1x?7L8g#(#2rzx3k04I3f) z24u96K6oaa6}nND@@o|NzL>L7aiQKa3}3|2SD{hDTt&kI#`MvKwD#Lq8*+RiX5d$6 zeaK9`g}?LQkN$9QKUuIpjwLpC8f?Ig-~S20BBcHTa4Id*zAVv3)blScEuAhYiBGYZ z=273|yI;;P4p{Uhnu15{M(kF~Luq%|k===DAVo>%W4_qPA%ywCYR;E3j#*$OoLRvq zj`g(pmRBABi)U&n^Kp{q&3LMfkq^20p{(bl!PN*!8+APequHf-1yo@v{LV(oK9(~vTv@aXbS6O#Y6gqkFUQfJz)JIjNJm-% z&MgU0GJ|-%)uEOnFKr}9O%{lBe;FXf+5vXCWQ;q&fC2-w96EzY!0sP0bSqPftJUxzjx0a_bl{9mo3E{o=F)Pd zrMS$Ioks}dyO0KP8u%fV@J`f3DpTy91Qp)}-@&q^1cVcij&qO#-D8yL()D|7<$KnQ zb%V~^q3))c?I)j@>}%#sd8e|i^NCqxmWtj$r}j7u5(;8WK!T_s>ODj!wE3fMsJ0Xf z$@cyF#l@k0!JDwlqW0uz?HxO}^4;E(Z2Db`xk^g{s|7oX8OExAiSgqW0emz<2JJ;L z&GGSPq6FuTJ#D-7FkW>u~$ysYAS@7 zl0ESEP8|k_!HEF7)d?7S@LO4cxl0Npx~y5@1Xj2?!zGOn{fwbsM+);DvD45@RC5zg z$~94E4?bS&y1z{ZUI7G3mnXz#WEQL0dftGLDcj)+D}mwODV@9hW59Sawnt?)$@aHI7^l!;ebn zCbCqSnKStK7S+!fr}AEc@2k4JvM>F(+wKXU_U`oZx>sWK390PfFbXg zT^RO5mKO*sPgjlPDsjt*0p!?KNRqCY_i}0sE!QgTx+mN`cKBRv@?{s7GLv0Sk;yJ0 z7O8D*ShSgd9n2W)U~Nfc3H*ZAKA@Y`!ttvOD`+ zbDCb}-V~?T{=Z81#p8c4Q%lYb^Pev0vzdt6ox#WE%WuJO)4u01U*1dXE#2?0`2Qlf-Q1mf<{*LKqnM(KVtBA-onT zk1qUCZ-;R6!IsNM=N+oQagN9KI_JLPXZA9(FM}gLYSiRMA&-xIZ~tf(^s;B>v)>U< z|K#2y_MBQzNJ{HI!tF9sdEKcCUybC*5GfgvsOF&Q0g_(_d3&wG0hxxlj`gJQ#oH{x zdnH_oYF)fJ;Q(3dj`h!b-zZTNfIN&mQ85Dvsj%gL5Nm9!3)}J*5FsJYed}xqvrauh z-mLc~F*lOrOex4&mkI0@zWP}&njAv_5cGMi;ecQCpECA{mu3m5L{zX?F;9)FE;vmJ94ls4JMbL3sh;jkn5 z|M`wpiX{Q;E-2m7x{YVc)+TXfYrForN##MWg19`W?-#+LkSwj9j-;_BX5^==?-WFM zvs@rmxPQO!jK4_t^Of`Qswt|b@f5f7vd1F!oL!522`XEgchXaVFPexI!nGNI>U$R# zjNmjm+VyztQ0>;m1I=qP=FP$B)*8M0Qq1EsblzgRkAIdtOk7{bz1p0)^<83e8?X?WM8`GltuLkdn|F3lr1nYTy$>_3MiRJj3x|p z97pa1x)zTP?->FLl?Hjqbq81Y)-rAR%_rJq39U)2{a~5xK(dsIITHTZt=RTD1DI%3 zXOcuuk?{-Cyv)VkqBm4jQ^?mDanW5G(gRrq69WO&e>(A(*aiul4pvvale4^+%2u6Cv%W;HZZ zYQp&)>hI$$9s?ja{J&JW24$2+;v@_b+e{=c#{;L3=l_il9sggbT0p*n)dET=?M@_H zS`WunP9cKIs(<8<>hc;S#<)xQv5+m?6Uoal1r4)`{@FvWN$SBXqVer`KF(o+6@@H_ zHs}EHv@5an3DZc+(sB1~fH}4xIknO3OtQ5;(IT09X%P98Ts68FLqPzFo(Gs)KfKa6zF6x!~- ztumhoH-ZKmQGh(40}`&D5gP~Qy(__tlAb`-j?%w4zgp)zJZbDD+&RG9d>{@LqO@bz z;TEuHfbOq`2bln3VbBF?p+lLR6w{t2Z=MVF;oTf zdEt_YvrXrUzr`4=opuhmSozMnV5{Uywgx1+nbpAXYIWVo(xI;pcijaM!iSI^!_tSm z-OwptH+gbwjJii&j>6xLkejUJHAg9L;lEktEcWAWEXR(U2>8U1RjVC9jjlzOTZ+57 z9Pg_2$xu%Tp$F7{qKUP?Ty9(L~oO@KLVXZoibkcA%0-e;=q7e>>TGK&~Pk&k( zHqEBon(Vz&eWcDO{DWbKbg)eOgQlIWijR3Wg7O1!j(qYAY9~vWY0PpjL(AE{84UQ! 
zD#~yFX@8zMr0dytK}Fc~GVNV>VkX~E<1*_=Qje=43&jQz>@K1RDyzf3jun+B7b0#U ze)xLB9%UV=TN2y(UKp11V1HX9-vg2>4M=bwwRB)ysRvgl*0~(ta%i+gAAQ*P4dbG( z&U}(rKFOkZcXRqEqAJmF!YSWmcfAmW)BT0U9VdbvJ&@~!aikA3Rp|Jh<%u5Z_vjfb zl!bJoYQi)dzgTW0ag~PR7TnumCgT`CbY`jN+XJb0-LYFqyxgOMNKpQMaycF^A4H5K zO5;3WNhCHJZ_kpM(I5zQymUUhGG*x>;HqNxWhq#xm3kns`rG#azZvBCgcCi9BYmmI z^(eOx7%0HxoPxHa1SglfqC?w^HV+52tu&%)&;XH&KsZW#bs`6kKfc-dqj3KU9R(b1GM)j)cU1NBBLKvp=-- zIG(b@tvzfJ(SsAjh_EkW$AGXsid4@u8#3DCrzpt+;UU2-nJ-u=%jbz%TIY}Z-Of}R zu9+cvG$MD3k)H0^a%7XD=#aD6LdzeDRo3g7*4!tC!fY64XEh5(U$FjM>vHHFb^eEo zU&!8--(Q=Oi*Bwwv?3ptzjjSj-!3>CX&E zkQaEgrQ96&Vh;KifI(a(43_%#!;#R*_hhC5*EsIdERvlstnc>ntN zzf;N=D*#N&b?ia37!wC3X%5sw+zF{q_IgkTM4kN%veMpPu( zzA=kMCK+@d6_w1#xnI4qxbIj#?48)R#4U!QfeZvirbA2^?cN!3_U(FlMj?shObd(> zVwe{30ce9W3ep^{K`*x|^0A628Us8Wt3gdevpnDZFGHTsrw>k%&X=^^PuXdc-0HD; zQ}Jd#QXKEfcaB5|WIaWffO-cah`+{B$cM;kfgZ6)BHaQosS#*mfa;2p^lq!X?Ry$^Hk~dpRIyl_QCojZCr5IQPm@K*(8lf3(j?o|A}13s z4-YT#Hh+KrO%D^*3r_A(I>T!%$!`N7UKJA5o9edd1lBi!@liD&#ncAh1=tV3B*10oLRN8NHKD@G z0<7HiR>QFNJPjv0BFxg0&~bRh=&^wza|#pXxos-In1)7C~O+&97&m5Z$5^N^9smd-dg&pwY2HX z=2r)=8Wz9vKOUHH3*I`9Da`g8m+EZbD8@PR6qBW|GzsXH4Yz zFYqX+zuG{CiZ4S#WVSF%4kY906K1TKnLZJCmy!73%*OzXE>XW)?{oC+7_G%mHL2MT z6V6#FT4#P1G&?Q;2aYMTqO#17SnHf?@UcSoc)u~#X0krZG(YY;skSGpfAhOTMH#^( zZTmy7*)2&wVETCAXANVk*fpq0LgpT%|0Lq>__j z%ARB@J7uzmBunMJG6Zr5}+jAKVoU72|IAAD3cd?9UzR#GT z&|6`PQ=y_)vOMA7>@WI(o&6i%Ci!I_Vy9R{dHSiwuerP3NC{m=UcED_>J$2%>ezA$ z23B?ozk;P$ElRomvO?{`feaMxg>7jyv8Mgu-jwYa>9h7a!|A* z=b^_@a_aQ&uM`yMx180ilsj}kYpI0o=;JU51mP zy}d=%5HVD6Z<}^jJE`HY%Vdi8f!X$^zQJ|TOJJbkxVHkG-NY^hw>{O_$h0~fs9Lb({?TIJL%+?KX@JFfVaDd5w4u#aGMiD*|a1C1FVUEEf z0)sV^xLRV zI2bNcL>*PP6~_h)+eROJ+5UC>1izclnkiRaYEgYsVQ5pn){=Y7W$n5jj2!bqj2w!t zf9c=j8)YvT!K1XnV#}HXJwUYrd(HzONcDGdpq~1im;#LmreiWOv;*J^w@9c%Gey2Z zY*lae@)U5c*x(6j0VWU!BrtR%ZX1lwjdw61S6>9i@5 z)`D)fo+*Yac(XSPZQG@V@U>B-aN+~a%bU)WT3*${ZDVJ8<&NlBD?fd6%h~kQ*S)uR z)ufkZKRT-T`Y{{t;Q9m%JNQ=GJ$mOxe`SuGGCe0sO5pekZVw(YC!uB{toj|0_pd=Z z9r`!Y6UHu6`CqDcj#wh{ue8*kFMXXv?uO)e4P9~t9!m5m)syU5#cpF ztlPFdgGfVpD(IM1C4Qd$^#J`k_k{z2Ux5CYlg^jr0hoR^(-P`FPj>{9V7{?8x0_N; zk7C~P%&)Y8F;Rk!KPNnLBHb4)j#i!!!=&98rFmd3Rk zJn4I5U&US|Oy-el$xIaq!mL6xM)w0;$JhtY0P`0N4^f3+b&soKx@;lplou!z3bJ$t z*xN^DmpQ*UbCAN7k4HWH z{7rausr~zl^IiTBZv7zCBl5f*RXU0lUDvRAm@Ex272VE^=#G;(OZo)0F8qai)=1Q| zo)6fqE-%Mgs&kwy6zoq-7HwWCr+RhZSArrr z(Q8+h=b@7UGJ4~0w)#HW7}`WNi}Js+J8SH+ce!Ih$W`V+sG%MlyH57i3lm5jR*zYi ziQ|HW)36G%birGcq!k@N3%B%{J~&*J(VNy|AL#9HF7LDy)<1ri2yavb_vOU)_Z*eh ziKuvCqzlM=pTC!HdiNmM$G|1IR$5Pei@T%ab^bf18?73Z!tIan2FN!_ECZ!AK`&58 zJM@Lj;1CVYZhjmMj_5dRJsgvTug0u|youos+TKOzWd-A?8iNkIkWX!`x+1$xEYYQL z$20@tt)gF(H#kmARFkg|_{#O*gD+Om6RNh_MxWcIyz8EA$H|AJ0&UChQZ-w*`eLYC zUXQzEGJKQRNo)W?K~VS9}Zq-|h zPp|492ysRr6ae&PBy_wv2D79f#z9JN#$@eiYrmIWU(TWX5(fGK`U8*}N?%v=`qg#s z3?6a~S)l;8+ZGcJyVE9-fKRoj<@NVhX5w9-xmKwDOxlR1P2?1*5ZA;-~!!)T0l7D_64OSn=ad zj+12IdjeJ2prSj$l=;@omWpRvEp#>B@5-kd7SHrUb1}!YpTLO2?LYp=`Bh_PRC^}a zmvc$e`yD6Eflb*3#P2jtn_Uc88dssJNregpIdE0QtU}(SgYenP^;xz@)4MD%Z;AsY zh+;R5JB2=l&_s{a3A_Sm2{145BLPcd2m)89#RgMPWUY?iAUja_M(CMJW(B%uze_32 z& z!!H<`H5pci!QGvKh4z=1!`tX*tImK4*)j({wjPGuyd~<5Zz)<)u5HRM>080CUhy(| zopT9|#LB4}Kr>YUmgaN7`+9tRDUuFUxE}aBl_2>5MsAiVYJ^YZN?;!KV_$f6_; zqA6*Bo)LA0OtXeAIRIyd1grWZsT3^lAI?RrAWNZ-kzmE)Dsp%4UOHy1f<9Vx z!Lk-8yq-SnR)d)Rb*JhdyN<4Yc;oz+1Pg7=!zYi5txX897jZ&&plJsQU6!@85o3>O z*0tzmEKQP#CdmWK8i~~+pdQEvIJk@ZcCn1uHJ5Gj;a*3u@c)( z;M-`8Ydl7}P|t*_Vtdr9ugZ9ruWgNB#NH}u#Eu#&bPTm?@e*}4E6^9^Up^kcFbQ4K zC-YHYJ?g#KOS2lJ2%J)I=8lkbua3fHd!qQUHKJL}HCC)}qsf-r`=st)`}FcyNqTv> z)+wM$c>lZq%=x$a%n+#ara?oil1=8gp5_84F_Tw)R%QXx?8gz+G3#-s(Wyd9|855; 
zDRtuSzp0E`x-IY-dlLZe_8sB)VpufmZbYpXQ^Q1!!{yKSCoJ8;d58-zp#sock?sq@ zdU=RlIs`0V2B^)XG2+>%&U-8k*nk#6@e6mZfVf%esW95YEAeBO=bcR>j9L#a-DG>- zt1rU#I!mt++MQxM&ll4Sa;`x?_XF~#POkd^2rb_aSo7Ya4zaJ9aM9|beQRRFJ8i-Z zhnUu;I!|&YLy}SxL|HjSf>czj`fTD;&f2?SLNK6mq~*r?3~u}7os%n;?o^qWRFg7R zt?j}sYHZpN%lpRhqpOKs>}J3jFOAe=@(r$@WVzFOsl@OXnCvdK>-12OV$GjFrRm?IHbJ|u&v99{u1HAhKU|UN z{oG1}uTJF6)0yn+4P~aM)W=b=5b!xGw3sTA(5ylM=A|`Fl>z`BJLr?JfvL_UN;!#9M3)6-|y46rW;}ia&z%ln_Mu1hT5RPr&yh5exipnG%rUucLmyK5nuC zr^(L6txV6&$FGDN+A!t$!mp2GZC>vp%b*ih-&Cv4>Drqn_mK;_P`-|(3?HBqE1M{O z*UF_^an32b6!-Mt^W0alAKgS3*RF4AI|H)UwU*vrbA6snXnf3j5d->B)jm@|KWYWH z091c?`Q@|P+GifE z544|Q)C%HpOR>XbSl=13!-7nGS4|G|xrW$j8%}tZFcwX7t1Ci@A;P##w4!ih%X2xg zkGw9}JV}4sDRqh{=Din94+Cmvpy>j+ip0sctPTnShQJ^~m&89};iYMFC(c7Wo|Lrt z&CT%r=Wl2uCRx{rNU zJj&?1a;L{hv#Unnu4aYUvjCg0pqc%%6M!&Q7vGHHdEr9@6-_%zh)~rbP?wO!KY=ow zOtISreKq$GWgZ=SYh8$+lCM)uygcYJMq(NutquVl=zN}w(zM!Hiht<8_o#nFYsK@f#U7|Mh zEq!Mt|Kyj6^urMQZa^e-L})wgF*$Fsh(_SJS_Yy|q@> zIcc|NWBko8PI0b*`;~6AN+_Lmd-NxV3vlgUAbp$y%M(b9L!chPR(t*%)yh2XWe^!^ zblV;mXO)1>Z^Ex_VXXyVjn10nwfDwm@d< z3swhmQBjMg*^qA0R|-t={yS=9Op?c24=`-n0~*-H}|6d#X4aum5*ACW2+NZ9I=-FSAx%>f9l7L!i>;cXflU%a^u&GqT~?uf*Kh zX&W= zQDe|?($bAe?{PjfrfWXkA>pVxR}sN&mx%MJYw0Q;mJPMLechJ@8aaQ|3Ihx*6R^_w zL}i&esYkMs0m=b!UVP42ykH6KN^^){cDC7#ok|69(hBB^B9rU+_BRPX>p`b)r~!~^ zzbX=7?7P7Z?H!-vnwlEYO8OqaX8AH-_>a_Z;kEJPXIcUml)MSj-C5WLif|~uMV3z;aIkdkk zc))N;VbP1Cn{Oxe?nv9Gre17lsN!B7wC4I|Onn_oxS8<)21#LVK$*G@$iXJaCn*5E z1Y4?gO-s*K=t~cIcHP}aBME>i;fzY-26LIGMN{#gjti>rZF+4F9ehtk|9pD*++CD< zP%xvK4WfTa>IKyrRJSUHCuZXd49?i_tXuZIAl2X{D;|s8A!}5`9J`A@X>fbIQ+`L? zM-odon(-Ch%#>s;YX_gbk15iBnaY4KV1blr5!&t`IqiPzB2C&u10cQ#_tb%Y2+j(-1+5&rFLe}WI5&h1lSmk|?{ zyQ|?kud;9`Xv0QBT*#Urz}5-?FUw!$v7g|1o%P67sP3t%{hz&ZZAH&-&3G2%n~KRh zv*J~Pz-KFE-l8Mt;|5>QidZ@01GY>RfJy}s3!5L-sxT3NPgAj|WjN#3jrsDTd#;&! zWR7&udwP0nw<_D2Wk;4baOGam0>=LcOi-=uGFDH`y!I)d$(=acRjH?+FU77e?H-mY zx103seKiPTYlqBIyMkOMH3F zpb_gYRoTRQSI$x~bi03*U~MkT9B_p-4)TO!Dj*4#VLF^?5r6q*IMG9VJZ}^C(qnfI zuH=eqd-y+l~#`=0E{WoCN8lK(q;$E3OX{^9I84m z+UI+)*{NM3QnJGC(?ey=Ndk$)%>k=)2Qz%`3=A_M_&l|F#bH;AwI#(Z`HjJb!`JjF zHycINOE13`s1|(RiF1J^X^w+{Mcnc--VnC7q=(;r^3q4|o|@k=%UXk!h8*EnpBqDs zWH$0lCV*2g%PuiliGV}8;srmT&-E(ylk-xLrXvZDu&(P{pXo=qpNjj?Q*~T3;_FcH zy{7XAj&0<&n;3GEVKNax5To6ape3+xyDXdI+FnBJnn45J)&NYz9l(qM`%q8NlHz+X zy`M8J^S7RJw2OJ9?R?&K-|N({Qx7jvWWJUw|LUP7?BwuCNg3;;!Nfg9#n&g*ZH_g_ zeij+nI2P}ASKc(^myTsh#&$9)u{P4!P&@LDZaiU*~WeVGVN zmwzH1Fy2Ka3Brw4MaXJ;NLbYL)0s=7;90KQTosyn(3 z0|1#XrjvFUHwF`<9N)dZtEBe5G$k%BbjvjH4ZzwJRY34M$+7pK-v%`Xgl4%t%_Bk= zBM)1<7ANeW8(gjGIfOm5@r?wJq=>c|De#Sj-$vatTq{0*B#wQ5#Yy(CHg))ug;{#Q z|DcXr))<_a<2qQ{w()5GJ-gBqCv;qlt)A$HCB2>D*LS)tV zec)+>2ZhCj(>21c6Bh(} zut0zsx*ci&T~r>R^yCVvb*)m5r@f19Di)5+A$n{V5SnS1 z;7*qCFk)xQ1dzdQU0NM-kMcmM<;!oBpy;`l|9J?=Es`&>+qw!2TOPX%rZL&lUFx3| z&1&#`dicS{hh zS+XCXX}2&48q-=Jp7L1nyfCLPyV|l8&6Yur15nPDM8FC;iylz=fI?zWjEjIn&IW?n z(C+sAj8;K2NDoaH{tC{RMJ#hWL|sQ>;TQVTV!>WKiU1b*kY)M7lq2OJ_ zIW81&)213^Irt-6IS>P zzIjhDwU-AG0mq?kbk-py)@RK}%c=bfyb9hN8G8f-P4!qlayW;9+SVX0skdG ztL?dvgRsdqbx;ib^)>;bT$serWIuYLL-!j1t=VFYjM-w=4;XAEh3jEckz2C8T!B0B_72Cg>=aGBOG{C} zwnW}7mxNk~IN*I$9zr43#F40|>kfr(C9JpL`0gR|lRkHrZ-b~BLUUXyoO7VVDT$`K z!r#D=&PTHnFm2*u2fwcp`V@#Lo`7Az0e(pz1z))*G@}^{U%f{SWXpW7BeGN;g3MGY z@`tRH=3-AsheQ`OsGOM9n~250)}|nMIS2i?TIY0)xkN~n=}gl6+XN^0$Hhn)8`Sc? 
z%Anp%u*{lhWk^i$U?(E-4&;elP(Lo_&uM`toO*|NKm+Ui!pPHDvI2>A2ZU2MIVX34 zJ=w?%kC+0b@_Ek>B=3jSpgs0>9(}s#yX1L~kN$DBQE)t?O{fo9$vDGzkPpE65m{oQ zRIPDKdM4H38O@q;9_Gm+2!EDPP`lPUieJ-dpX&VP$dQaK12?zy$w^hJFAwxW2eOa0 znCs^7{6mF|_y3tf1|e1>SIc-+PR}8W)<%u6O2sveI#RE>mb3%1H$gNd6v`W_1z#xl z+g4*9rWq{%px6c4U+iL3ec?nW|FDu$q_5jsgId1AtyxOx_?Z^_bkE6{Z5vN`_GrdX zKWMHJs$$&eI_RwV+Gl*wDJ3RETx|KWF1|P5FltqhX}6(DfLwHE7Yxmy{U$}nT`@N0 zp#O#ORy`vn&5~jeg^03ta)J};2hjoJa+!8t<8>yI^&Tc!CqMcaz41uGWf1)0i@z@O zIyt#QoiSQL;4_h6ZMoxBt`LOX{mymR1nbpts^_l3X1B7218<#fHRG|Ql;+u(5LbjaU* z3(mkYmx0azAkngGLLURG&LUkHn%~6udio*MDjbR~Sxf4^HBcA%6?ZhVFK8>O;7h-3 zJ245NOVOZ5%y$I5p3c3V^emfJil zyDOFX&Bt7<^(i=&_J578Y5~H)^e>1fZtWbGx@*uq>j$p9+*qMC-tP13yQuVxOfrD-=w3lO_S$F@sw+9i(HJ z7z{V-C_Rm^WssEe>vaz=tPm_y8+<(rzW&hDr z1ROFy+Oh>Kr`~T~97w19Xebx7%Gqka`nTFYW%1wt=7c)VS0oEK6N$fhhzop4osEQo z*zq>z888^6T~1g`5UZt|E|}FpB(2=+j}o?@eo>Aw2R|VMvZDOwCn%s zH8dcL-})V~3(0`G0AT$r zMNNRb)x&m((x)XEG{I0}7r#J(fUNR%JI*>dmt_K9N^>e0ztuxN*_8BdpZ+eX0}U~) zEBSF5Iq0+X(PL{smD?Ub%`qhQEQXef2#+sAa6WTf6X4@v3bkv$S)rP=Smpqy!hIfK zFp7ZlU$Ke|=szqMx`=7*WGVIVuwYOeC8q;sO*B>E9eTKyjIgy}yRl=7_w|>XxVT@7 z=j$&d^!F!^7#^gHT*v{V^N1PzIwzX;%fw3T6i%DexzQgLiCXRNEb+W;>>N{?JK0Qj zs`Qw}H;10F&)JXj%Q6|j0r?9}mro6s4Z6dB&Z#AcFO@5VE8~&5xBaGf(WOdxd4+9b z15fj8HKl~z%u$uun|LVdGY>%WZ{Dc-MD>}31+MqVrv4dz^S9D5Kl>V_XR;df2oQ~J zK%s*V%{J_HoPLRE61svO!UB$;?xnGFT)%*tla`|Zvd&LFG)nD})NjK{xc(3PUNYkc zQl-Gu+}?$MM_?#_fo3&djv~AO1)%njeGeK-3~~d+F*L_eU=sfRP%W_~v)g`4r18wP z{r}|DSR$)7-Pr;X5;ye|)o*A{lccbPMIbFFC%WQl;TA-Kui^A(bFAfnVlb*rPt4Bc z#6O*g{2PhNznV8fBZ6hgl0Y21;J}?0E2iw>80si?3lBNjr)bqPp0fqJUX7{Gp32=# z7S7QJ?$+~-KVs(o+5ayrg$R(&o8We?Hwi`qov|9JywH(T+0z<=u9bM!q}mXp)721l zak}r2+1_nV7v7tm16-HdEgZ<)&&uWBdf&n_`-XhcBf=iwy@Z)Y z8O_<4d)cIi8@_CxRwQNYrV^@~2A=}_p3=YSm;O6H|DS3CZvUMMF*E?NMBETQVdh$- z-Nb;MNupeXL|BH@CZz%YDev-iaLAyQijOFB$6MEdrY8fpYhA7yEv>f4mAkDmvmFVI z5-ofM=>m#UntwA36d|}+1-6KRCBu;=e|C-wJ;%j`)&TXCn9k`ub4TN{=y+6qavH%5fF4kHl4g+G)z$0eJ}VLt$PE5aPt zMbsTod#N3rJ(zO+e4wEH&)$jLY#EbNtL>VQ_0F3gI8E*p1H};2>h$APW42Sv* z?ZDtpcc`Kh4a@;;+2ngP`~mgD6N8zZVrLtR&!o;|ARv0boC^&DCCD?B4be-{KP*E^ zPgEODI5lKBWi$W5&;7U(WUXu0tVwN%{d8~MgMXg47IMckHQA+VahqAbi`)E{a`R7} z=3LkiM8o&l<&ZbjbFTs_0HY_c1C9|d?EGl8W8NmBG@;8!;&j#g+5y2!Lh_bG%b0kl z^#@n3S+-`oo8O2<)CS*8|E@d==kE~TfDtYpEKvhc9N7n5p4E&Hiqr&sS^Tdq+#D_E zqq|>RH@K@kwP#0LgQ<^z_s0-Q+(n^19SQUF)S?pmFI~APUH=B@`oC){+9ge-IX<9X zaA0d22IfhZuDhe9WV0(-fc|1dsJTiA-zEkTnEzkRBMfkH+*!bnb#O&05)|~|3HoSl z8<-(5Om`SCB1i<~QjPQ1OY8|Ke3raVtXXx4T)<^!W`^lX{J#|iaw5_HWF+|hcj-j_ zJJtK*-}>uAbbexQmQ9j>2HQql4rH|l^m!_iJZ+gG{%E$O>$g89y>IVDnn(;AGCJ_KbTY-Rt-%beNcEKExWCegj@$?h% zA_*W`I4O|1sRAI#aGbTj!o7f11?}nXk%z@ZG&}jIE7d5W`ePc*I>0U(!33qQmi@E{ zL~x7*kDqTI4*{}v8HnIXd_P{(HWFRnQR$*ojjXY1UH9JZtM2HC+qI(-&e767L-Bi0 zrC1zEy>tm93srC8Gy!SNuH6fYcRO;S$;yjpQ}EBHAma+nZosbwclrV3dgW6QSs{ky z99K#vRfr|u4bgizi08do;WkOJF^HAoLCd z!za;zH9CrL1x*Wd2f#>xkqocmbmDw|1cUW>0x+xqk}-Nf3WC=ruvce6vMf;V6U`EN8X>#ZuF2Mx5*b`%@KoBmvUr zFtBXciV`sKo{x(S7MXWDu|M83cB%?l8v(Xh8fxJKzxh^yY{VV0#XVfy!#@kf;yGND zWDb z`9q6+VN=(Xa;2K_;P(i|3Rr0X>ds`y!P}U|aNom@7s8a6LZgvzh1)%fv}4FV~J2k-fp?e1Uy{+D+y@Qn*R!@lppf&KSb{^gj3l?_t-xAXh=?JH*a#~q7V|8_?X z)&rv(0#z@HNMQv!cV;7~6RytPK zCFVFxRz7(CN-PhrxP+vXw2G>lx`w8nzJcMP!$#)EEiA39ZBCqVa&~cbbNBGQ;CInK zAn?-FYY~xA(bsR>iHlE2OiE74%D$hIoA=;herZ{GMP=3FCr@A0H#9aix3so(_w@Gl z<6ggcJ2E;pJ~25pJwsR^eqQ{twEXoOWqsqvCiN$LG{IIY_fjgTJJBPx4PT`{tT<5|?6c60y7Cm;aq^^rc>7XN7?EIBsUU6kTf(qpq zOMl_$KgLk(e~P2OGxT?Uc4i>_Y%Jh}u?az75auiFeD-S?VFF%n&&a}X?m)hlPIT-* zrnKf^fbtr>bbuPT19{;*k7ww4OyYk)24mA>(H4+{XUoo34Ikm3QlHZJahlD5*MRqy z=>NrspQ^vwrd_Kjjp0@QU$CEa96qvZrsIAAhgk~bVwYu?eQAu;bBE_=mWw%qMT14R 
z)maKSWd0|A2zw{U6m*i@fy7ouvkPvkc(^hPH&d8LzHc*jAa};Giy9p)zvjIH^%d`i z`OF&w+xfukK+R`k*Hd+;tC(+G@1TAR2fzWweJFrN$9s43At4GT35EuJR=?yKXDt1J4)S&zHmvf3})(wGtqEp7KD z$(!szil3tJIXe(uvti=UHg6SU3r;TWylE;%ga%NS7#wGqbPV7?&6G!daE7hD-GRi5 zvrR8!BxzQeuw_m3&qY*sW<&G_Gxo_21nEjwe=a{{5sjNMxxNEA>a+vFYB5D`mEq7o z#O!J7Sb{|i^@djeABHOXS2XP9svq{*|Hr}pNW@vwq7=%%8te~5q_V91a83KeP*(qn z1~)t4|%R`Aq+TC zHoSSWD_o{?66$*jYFQC|M0eOi zG)BEN1@d3DYoRHI^gCRPYxYat99NmDJCM6)K{Ar*%SWz>I#@vdtM}6_JCMu)6d8tM1o+vno09f(gRy-$zp`7bf< zK=#@HvjSBA+cD_K|0c!I+F+xuFj_kdk%MSirZDn7x_+tWl_73U4X8JGvg^@}qK0d3 zdyKs6+7Pd`Lyx`-0A9<+7*Bp>piI$Z7eV=GEFqCDno2dJEX5#{ZV?B4PragedIm@b zf0+H`^HA?e**V!f{yN>bTc>Zo*DP%mF=gVIysk70gdp&QAjpd9GB@MJ^vLny zZWMpGh878ETZm3P>FsC{89PVxd(~bi)eYU7G!>!7ate0w@zo=4-oI-^YjaV1i6RFJxjAk zumZPf3bs!GR3}=5?in<%D;{kba%|z}$%p#8m71~1q0%lXHv{2=A&Ea@L|M$Gy~1Yq^$^X+AMPzU|FzXm`o#k(f}Ds z4PiV&3Vf@Pt0En)`D&C=rjm}!)ANxWJujVG{xZj0DJFI=;l%qANmgsj-17P7Bf-Ml zv9mKX{bH6#H)7h^D>oq;9z7@NnT$5r{8Jm6403diAK>$y?^y`Zq`n<>qh!?T?!wTO zW?V|19If#97`(5|zGER+Y(r_Px`p1VwJk-nMZojs1y{Zk(HT^6jc0~;e#kH`8Jp{7 z<-yL!SMYp^zg0pdUUtEGNi&bGqQZ0mwPun;8FM-xOH!c-zyyH;BF6yW+jChJT@AMo zxJOv#zY$Cs3JNl@{qnKi=3YZB%WMYrIg^F)M3GSko5!z-ApBcfJk!t8^(Pu?g4kpt z><*0B-YhbxWqWb`@jf#vk7l_BVLx;tjO<_!XiR1>*`ra^5->yc`=sSY_Q96Vet&LN z0u1UjCe{_Y9DVWLlbg+UG3z+;ta5te8dd~5|5Tm@ynOzFG-!g@`HV!hq#pyG0V7tyIMNyM0-oQ@wIZ=ysSfB!SN0v+a;six*(GUMc`t(8^Jv`mrYsVzB}*IkJfooM7#4mL#a=9cg(j)gUO3~ zqS!ZxuCtH$iHF{EjkG-=E!_`ImWufyg)syyb|A{x-DoNY$k&e|BPO`W1luP-Sxi)R zz(ye;odVjb#{7_QbRHowNk^~6Oa|!a_xuUEfkf@|@o~?+bPDyZc)a&FZf>`4Urdx! z(7HeLjUFp7vOguuS}ln)hxg8?k{Ep%)n$u>1;Xp`D>h__d9IaeVy5s%z;z)5TeU0% z^z=+#4Qktg(3PH_cc1BB5I;D0d3iwgqS7m0#}pQbV7A^@aF+1XRYbuacnXw!5}yc{ zG&#JGOEHa4Ox51>7KAh3O*Uu|KhJfPhTZ8~vRJp3t*-ag2_Fq|kl4?-yw}md;E8dI zq$q~J4)x3t3p$TWW?J;!6WAa@Kqs?zZTe+jhw9ifGg&+G!0}fU`J8iCwd=p-)}0d4 zX0Lm`-(2}c!Xs7wB;FSgnbe;_)@wO*{Zv-z!HE{61yeLcv8B99{M%k zasOcJJuSDf>b>q+-?@+QF{z*D+m(!^wbxeaUTh6fBuACT-sFx$Q<%rHu7s$bA*o9{ z5DC=$W4IjbQUN6bc4r6TKg3gYM=wBNv$?>v5Z0itAtu$dbbV)L9Lj$S zdtPu>i?9nzI!Z$Uz6(2$3^N^oh^!V&(lh1qp+OTD?qyCZ4A1om&ki&vS(=@mcVuZsnQ)Z)2IOm*P) zLP0j(Za?!bbhowBFE%P&7cz0FPq2RWz+S6vZ2IX_ z)FKIBtMn5f;g$AM`(;r6CB8y~$dCNCd+(OSoUPuMEXp>h7l)jB7CO)(z=iDY5MacD zM)1m<`hnb>*>3eXIua%T<%lMFk#AVZ4ZozF2pD}}(WR>1B()!{%P&z$gliyFTPI$?w4wloM0z$f15fp6 z57~iK44#_dU7E#`9du_!PBY?B7h6GNDU#3ZUB}fd&sa_QwHXAZyR2)SQG1}6$8gKO z?2R_qe{Z4#-;_7hoPqGYk$|hTKx&}6x}fVBL1>yR9l8Ul3cAF2Jc|7czq|dQ>$zIB72DM*3gM*KU z*Wtu~DB3NA00q;(2B>q96vD~5QWwFP1t|xx;p9cdferAUCa{|~uL}2Gw$5_veO}^N zb-?$sRBaWt2_RLWx~w9qp-J-W$;xE4Zs?snxJYfj22VP|ej)X6@o30{G#>G}Ipc>c zK74+j&`3;9fZfnC8`T(kpA_#+qJ~!D5NPL&VNF%H{3z%WEyW5Gw9@T~85IQn3DL_4bf%0;=8{O&SEfvj+oX20=tq1ZpD4)hAw& znI(^-eI`~t-5Rwjn{s6$asvu4h!5-h3=oaaIw}I)L_7lH7Tdj!2e$&sn~P;f(+)g~ zm2iA^Q)m#i7uwXE$rOynQh`(|3$RoLP`}R`cEdRW4cWSxJayyxf~1gR%kc@Or#$U` z3VWDediogA_etThR||0rdv=!^VG&2NC4!69#=Kx0Qy~CepP5eIWjY$jR2|(+*U{E; z-cWkc8vAzjXK)=~YGBS?D}~GF_dYX7lWpUB@Y8+@OAI58(jt)`V8Tp;WxVhT9ZG&l z$jj6hGuqNEL zZ1m#xb$LT6`{|2Lk~2qiV~%Z|ZHPmbGbeT+GHbAly3>g`Sc667ef@_fa%EnZ>ZaN& zKk3-D4CQ@^yA`t?7ktA7w=zt*f|_BYh62|qtKaRtRpGLLuN8%Fs=VXIOw*G)$gVRB+CLn8 zjC~5Mf4n?gtJIirahAXDVRj{6d{mCDG1`3c6u$?o<-2x~?oEMzMJE*9`SdccXi7K_kn%NfkgDGO{nYE3Lu zYR)n+Jk{5HA~NgBiTT=(BYPPtj5{b$jAE{(m#t5>X-_Z`BF#l=S1EvGp}v86BrQS%Q)zQXE$rlP_IT~B78p~xln znBLMYVUD!k96Yuvr*@*q&qi(LLhiA;IDY3xltLV4sj79XPU`dew9v+1`DsukF{UU?>Kpl2a+@pdZh6!k!d z`!~~EH+U#Y)EA68Gf>&OU^|eDu$Y;Ss;2;S0gl6n;O0@ggCtzS{HlAZW_Yqz7Zo3A zBrGf{Wqt|Xt*mmr3tEM$uc8Z!P!BS)nR`JEl7jiCdZT)7EXG~wL!O(|Rleh~k&|Z> z=lJ^Te#hCdb54i2q{IyBTw(Ua*g))9N$?H+u?u3xZf8@N0kDhk>1Kj1T|9e%s}DN^ 
zr_LI7Xp!*=?S4$hKI%;+p`9b=TM{}~y<|~iErGy!okAID>dBpJY59OTlZOJ@Pb=U}^FEMFpR&8(QlvT+8O=d1(5kt(Twa4O~g6ID=-U!&e(R_pDmIF&RkvK*(?b^s4}2a zRb>~*9lI}0b}LI~X9TCaNW7BDeose z1oR4nPIk=b1c%OvUH5(Mc>bPq)Bfdytq9*9_zN^?n0A&qf)-$c#pbSK3S{gZ#C~!t zJ;;db4qexzEg3t}ovE|^ZS`1*@TZvl$DU5_xsWD@ZyY=GMJ)48Bja}aQQ8USI9#a1 zh&g9;F&z^f{-f1IWqub2lr;P`-*g0Ay5_8vlf`sHXW29*dy^nx_F&;THjvK2sDfS0 zjEREPTamP`R|455$>~4x!{YR1-18TLFuOG4n-aF<)psBxijZpP1ET{OPi+{xF&vCC zCQBf`p{mCfI9CmzdwoQja=-8;>ze4I7zK!K&QFV5U9w5DWic#DEu{=r+&q*ngr!~{ z4~B(grb)L;w^F!z>|<{9W9RJ?fAXH_31igU(8Z8)ta}>;fur2T~tdu+N`dnHd%owby?!pPc?v*q`s^wRgC1Nb{LNWJt6wlV_B66mjqa zDi|GGwEN}A1u_(8fSqWU%Y0I%ds6%X88xcD@u$Y$57RO&wBj@K zQ0H~;_-Hc zq+9(0fX>0>k?)~A@^4|gfm|)ohLB@tIG!8)ynAZ)lf&yRrQmb+hs1s^ioze0J}jEB zmCi*6b>5Ij*wn@bj5d6?b&r&-1f4D_jH}FimH_MiGny10=Kn z%5R7RSiH6aNyhUuZc@+*?I)ktPtAU>d%wZWe18hrOK5L8WlaM@Pc!Vuw*9|G)fI{! zGXKzF7Oh1Z!fG4AKeh{pL}a!^N6_1IL_j|SOy0&-@&5Kcf+_7 z`ogL&Y!_%F9%g%(vmV(~f}9|v#4IG-RCa%WztuFcAXS*JUEreq_(;gs5Sf?{#@h4c zuoswg)Fo_;fjE>aw0N{b5rFm@*{8NEMP2)%bzxlZ$@sXMA4l8)OK*6X(&C}u5E#nA zq8rzVC);DD)qEHaQ5W42Je0d%x50Ge+>=Wk3Y46=2k@228#N-(w$Yf48Q(Z?)_$Z!)_=->Oio~eelQk604rg|yCciomsVcbK3NY1PwRe|i^H5mD`2TrcjTLY6}uZh}_&n=B3 zKBk(V+Xp$97w?N_1`6fC42~RvOnwjiZXxQm7cA;1#EUu{;rQGB$HLF|;y=J5+yb$` z3+I2G>)-?ZLnd;-M~6|3y+|a4#h>Z&Cl2ukdAKIiwXQ~J9Ssw&F8^3QoLngO;P_i9 zc)*a-SNIKwV;=d#~8HmZXKsgtyEK&4zV%Xo1{PQDK3(KI&w6=dvEU;YeQA z(I-LL2RuVrEst38#l3MzyNEr%3{KF-Y$y5!t`{Zlt6j+r4!>WVCs!sg$%!2Kp>jLpxb9hJP zw|g^}!hxN6>wd->6Kkpi6#j84>t+?#UZ_UgMTKtZmRwIC3iFErMEyT;~5|o_x zwL_|fT)r>)wA-zOFV|Wk_ix>N_mb#k_%P;>q|4un`m>MKRgVacbt z+hK%hr2N+?BY3O{lu-Q!Q__(8v@v|UWryjvn)|GvWgLAMTij1qj61+qJ(N?PNS*eXo0wc!Te&rFpIRgdH0l(p;L@3j(TROV;iF4|AKWgbRSKyte|Q!nSa@Vl zq-wj;yenMfl8JRmNrZZ4N=TKQ`dwh?eqNfBGcf|V)s@OrZD|> z4o+-**rL13Lex?6H()5sAL_T`e3<~fc?WVErHTEATlt%N`Sp%eLUp-=zP1qCU{CU0 zSmUO9GVV6MGVGbtzGFFo5Wx&RIdRUW{c!DZ^ZQ!-&qijg-H9iQ&CYElXi>kU1WwsO zc|nllMLIqSHD73^%t%FY^aKX(EfSEI81^lEXHb_^Vy(#Ex!=jz;e#-jFT4oL+765YGOzbjFp&)iq(@d?4jmKcvRLc4RAgTlWPc5?Q@zNkkoEGG6a&j4RVYl z43Wxcd%;!YI8qTg2CrWt70?cUWpbkX;ru41PdtNMq)D3}jXWqAZq+doXyRJ)EHvFk-3X5*2p2z^EtcD@3vA+4pMgs+87 z-5BaZngPxfC2w>u^#&?sZS~qLTJTcdmV3go;~CQzPmgzS)^|V(%{laa}M&bO7SFKtn+(nbG3YL?r=-)o6=XQug^X3&2aaSG>h_2 z7Ml$cE_>a2P{2%G75iS*yv{fc6n7(@?9ULjxZ8B4j^Z+^@%Ykf+EErhY$v?GXjlTFwsDxi5z}R zSzoQEl2+e~;P=6PpBn2Q?h|`%Jrjc9qzKZ*WqNlYBJKL;XjVWNp(-AsRPhi+P-q{< ztm&>bhJAX`N0rRixAogFc)3+&9L?5jr4?Js?1CCN+Syme-SUTl zn-qJ`?B3=M35wXqZRhvt)gP<2otpBv+57?55ZQx6L#LpktR}Fu5-{Eo{U`W$L zzK4_Dpwm!7Vcv=}x%mum{V^bTbmN)HVIs+{7VkXre-TftZ#`3_i3*e4Yw_+7BdxfZ+LCb-r#?9BR?y}F81i}+>A>k7)Ppb z3U2>EebKV4Z$NFS??X+N?LhY1v8?DhSnPql6JZLv>_F~U0$j{DE;(R1;?{ZWT}>ty zHNP3Qbs{VVwUQGE-;SEl+F~sh+!^~Hm7<;x=n%H~anq(_^1xcc!P{y4kEvB3EG}tX zW$qr^fuP@D?a5{TI4nOXkn$0Xcl2RP>tUmGFz5RFA=|(A$wp>6Ud)UYIq1WOUeG0R z5lMz!9%3z9YZ28`!6id}F86D{9O{0$m^{xvNf=53o%Fo!ZIn>N6S^9x7VCgJMU9&N z>UNWy`Oy))?FZ2xj4vfOrMG-dLiYumKA<>DHkXR6ppG`UZsh2a_JfcC8^Sxef*jVJ znE>amd9WzaJM&|E@Y|D3=^HjIN~2fCs71eezjq+thLa<(8F2oFyd6kUq>pG3!K&%2 z&TeMThv&{k!%8zftqchV`1>_F7<>V`f(%*L7#KMf#k4JBih&WFL?Saos}l-XfZD4v zx*d;N6bunq`e9*#GRBfIARFzlAK^X_5IWc1fo$ny0Nf+TV5OQ`Tc5zbY!v9W!I-ivh0Mhi3fqCaxrXWnZyyWdox^?6APm9{u5zE-%u3)} z=CkIl459xheZ!l)Af_unBR901lEREa5#ZqR5{+73Xa`+(90;DTdqaOm^kY}%Y3hen z)Nv=*jhA3s%MOIe*Run$5}OyKE8;-_2So9HL-nm=*!9Gw#H-tirPxCJZ#e&FnSKLh zdf&9c7(FZ|MqdS`_8Y!oza#rJhF(G{`N&Lqj9H5M=b0J)E!F=`(-k3G1cFwv)EV7E zKiUzxaa;JXD*0Mc6~(hOU_1S<8e(UGI`jc?3hW9@Hj@eKc+~(LrWnn^wxSY3%+1@f zr>~^%hPgwi)%U;f}$lo*57Gy9wvp zlKf+Q_JB3<78qfd{T9982PmuVz6aM_mgovkb|A29B${CLkHaI9|Mn*l|0rPBZv4Y( z)WZAbq&($R{3H%^_@!T-!l`|=<8HMGMRiM~H?|DFGDZJ!lyGj?6HQL2qBHHbQSs2f 
zA3^?G&Le$6@ty`t;5ApNQUIKwtu{~1EjkF)Q_l3}`UsI>i5MM@SFa{YB6|>%0{h;C zgg^(i(jX^ToeYj>O)+Fkv8)X{kOV0Cat-6@To7EEDe^V%z!hp!heTUjf-cq3T{ZcU zpks=CaW$U&1U5e}xUCE416{yfum|z;)678mCD;ULJ^N`x7D#?ncR8pJF|g%V2whP3 zj}bxBdJLMD4d~AxsNov`Xs>bcIMCBLkd!`}HCt_E&fU-=`!dDU=(3;*TYM*=n8FfJ zdX5!c(;hg)#!Q{;27QdO4s1&k9E-5o278Cx7>3`0{)QMB!4cKf3H{;IA+ZA?D1?KK z$LXf^Pr;gDrti<96gucHe@6+ZE!j)~(8HB(l0j?b;mc&6D4VfrxfPnZ0d4=`I*RdP9x#?lv7Emn<+sA2-dlxwmeFx*reXl-^?yUj zZ)DSud3pO2^b73zGNvo@pDFpBY)s26$hTTu8Yup@u%=s{rS(A-jmm))GBHi>CdJJ&j@MRw!Jsc!^viz2(Wqzc33B%9;ngf!Amvl$byWl8Zg#6 zwcUr>Q_^bqzS8#z36OSTW3G(-qMXT~i&pHAFL@oc!lQK5^ zdYIMNV7U=u^)0q*90*BQ^*kZW zK0FXtYx4EMx!l>+Vj}O$IBbiDd}1l!YtcwvN+2nW{-8S^qY^fs3#~IG**sNCqk9s#+f9jl zQ5!WTX9#&l28|(yzG`~Stf=6s6MkYI{eWkHvIa3%z0#$A=p{xil6~f?$#CgWB12&O zfbH4y+Ln4gujJ|`#JR%Qo*_*Qp}2yNcE=JVUrr3 zcgwUK**7=k<@-RV<`K`H98}x4gX32YW1dxSMKh{F%)3K#l&OU!O@b2qIB^c}ru?+` z-gvRAO(Ny2kH?TpqnFb42i@->5r;lKQnK211}_@|U2_GXOC>*_=n$bFA65avDLtJx z&r(pgH#z(Is>vLK)v~ur-=DlATm3=o)3|N4TiV{ouOf{V9BcxRwZj%-?*hd7qJJ`v zB=11H7nl(n)L}W7dcs!)bH$LdB*Il#e&xULhZ08|q6Mc9fiMTVLR zxHM%hKzZFfKTd1YErZUldmrTyNO^5NwNxe$?juO~&g9jOBI{E313?slA-mK*m}JM^ z03-8gcz3SCoo_}RWr3o#U7SBNM#Qg-9DeQ0ay7q^b0WUK;L)< z7eH@PIY1=g5;wg?7Q`RVNm4$|6;=-LPJcJ7OMGE4B9m3ug+;)O>dqH`UgO2uqPk9C zFJXn5{qVZA@RWngH;tXmDKB4d@i`OcnPnRaQ@g8=XZv#vg&{6d zZ^9R?J7a~&y;nFjtI|N@j6n+^!~!YM1;O-d+JZ;Nv~$--m`o;Lw9eUU8)5{VfiIl) z4uctS#>?}E1U%8RaBl^<2c+v5c$_>}$b=P0rJ zr)=y%#F(R~djH!9l`!I(6yg|pBCll5YtTOo75;RlqG8hDoo~MS=K~?v6(h71HB`72 zV*14S4t`XBW@p^nAv0lO3QcWDx19KHlP}n4b&LNsb&&JZ+}RwPV+Geg3hei>nupG0 zQ!fJ53tLPmVjqx5!0i4$G3WoVDzL4haqxrC<~X!?UcRwbjfh zM(zh&Nq(iT4Ih~UZ5R^UO)`v)`l(4s#uuXi}!Dfpq7t%2Ap0)C`Xx$Z1?R77D zWq|-!7LHwv1`}f%4|Bjr9Uxtq@m_sdF=cnPY&k!E+wINMA%RcB=8xVeobJ3Qyq`_h zHJ3RBZ8VHG0nr8aC9v_plX{slR~S*dY$lquK`N3OFKtDiY|Awr{$lIScZ_cEj-(cd zGj}TflyW-sQ|9OitqmO2kx|>R>nrRwd6N z)$NV?DekW2qdvvA2LOL+5hK4nkhp~ujW%A8NN9wMbiYq8pZ?Y~>a&(4{7u1zFDpiF zp1)Qg{LYDYUYNR7Cs!FF7R=$z#DQCMcM1$XiZHCOJl6Q7I!Y|3#W#lQ;X%dESrDmb zsCcwghY%v1jHjzYIZf;g20I-N2$g{x3QWPA3v_p&+dgbN20aUiz)eZaCMFiM)(2A6j!kOqAo$HZeVpx2UF30l&$3A+@`+vAV9MxwCO>XoMhqm0z=XDubcuCY(iyD;q_^ds_ z3VVmie+*n=jlh?G?$G?to@-~WF;6(78P~x?xrFxz?*d(+SDqKxv`s!HC%;0yN=RMo9x6&}b$_RoeTLu|0;u&^tw}9bQLe zd8edFbgRXzFSklc9C?d*EsxdQz|lnqdh^g^C+O^eEvP$g(UD)ykP`{#r=u|oA9LS1 z>&r|idLKOQFH4MXc_2ubeH|gmxKdYFH?neeY zztm&p#2AooNJYQ_ z{63Wd1{sY^6(G_16&V||;$qM-_Staz{+pK<_OtcaY8~9`pz9c!HgWn%#NdYzsQWPd zK2i>pOl)L!XTYg+?&yK%iEwgqL zwmhWR;r**;CMawM1!BH*F(@H1rk#sC)SKUzo||1;V5f=mY;J0#9FSa4owxKlo*MWF zRh~|Ju{%nsTC`TSYHbtohN4An0PM+kh)Z{{{+OiF9mvTslPLZwtFgrrp=0_$#3XRC9Gdz0|$Z z9{DUY1E?;AL#Gl2b-x~9J#-aG(QeYAf;0_H(KM0TZ%hxq`5WF5f_Y<*QHR6Kg2yI<-Hg}bpWf&1FM(6u)G9HQ!K zn2Jl7j%15#*Pax|N4_n}NWw5YJqk~DrJ-aHds2x*D`jN=KD4NbO5laC##bja_KeJr zx+vpKw7zVmU%d+HUnV$K0bX}4ZuM0lCz+g-u$8GDQRA&9HWA zM<+LTofSb*1j3h9wFXjO}ZgmoD$9lg*(8NnS-E)2_2H(#1iYREmarW1V+TSTFdn?2#|0cIOrUwORPm%;Flk?af zLDoPhpNVNrMQ#~Ud-u@7E+t1>)Aw1{mJjQcZ64Of{$#u#?}@~;fn|gcDW*7bP+pNy zg$jnU!4WF;A5<@dXnvvMonAgVxc72W$H2Nok&6&c{%X-FE^BHEDN-)8`TKxL1f`i^ z-vd4=>f2dlI#DvJkCOWIDkC{a(R%QDxU`hCo_bnw97o6bGFGfU2`qU$MQW3s+H_OF zPEx^`Ic?w^d84;ke4;HXB`h=q98jw4&`raGv+e-dN1;|16*zjIoal)iV6zCPMZ^=5|=3aMKUjJv0~qojks=uE0MC6jcIFeu}2jm-VB zc@l9rnxnG9_sP_Kk!icTZ(ry>h}-uz=d4xnR$f1v?1~o-K=Tn9r3gVXSHGDO;2Owo zD2++!{osC7*kE{XuFHmQ;Z1g(lSt&0;zgG|aV#kjifxHnut0T85;g!kTSdTkhQ*n2 z00pIA4d)JEfs&h*fpbE`*ZPw~^^FUiU96o>ygQeEJecQ7-<2cmpQX|BU>}SjWvwrg zsaOx-$omho%a`@UT^-?f?WGVt;3&Zfq;hPztc;_nsI%^X_p-PLT(SrIpzD8*`9 z(M3@Ubs&t;vzLAr`jZz-u0B8APVH)AERhn)d}kUd%?T!c*I6Pm z$tAg;KK5gNnrmH%=}8gIp4xls6t{`M_&SDUNAp3LFz$l+#jfboh020zv-pKw9&)O; 
zeBO=?`c_B*Hkfj*pZmoX%!Se@II%5I%5vuHKEkzCRc0aglJ{S3lS!} ztrllFD-T>rpST?)v;0+&D=oJCg3zewHzHClnhQk3dE41a=?BOscZHNi-{{|E)Gu+b zB7j7?YbBj`#f{*^pY5F!um=PFJb=3WiaBvD^#(2x8>~A;Je}B$YJl~~4iN^O2z$mu zEKla2DtVi%9m=Bl^1afqVYPXu{M(Gns|wGcHnDfE zO@?*K)StT#Woewh%V}llRjsALVLhLbvvDXz;Qp4>@-eTsU*f}kOhIr-5j7c&EsSjZ zV}c&L_uos(!4jf5?5sUay$#2_tBpF|NGGAXF6=-S70}Ps827!wl8Uu>C5rE04gK&9 z=!*sii_2&Oq!X&^GImh}FTDd<5efvK1Ee+raB@#4zOi}5yb=M{L}L4q!}icrz1-no z6Rp~1m)wQ2@Y!mD+rQfcp-j5&8NOE$B$5um?KyTn~#%D;<2lbUq zDL@#lV4!^zRjjCX(XIsKD8?s%4e7}?qL1EUrKn5ulK{OOj8c_ z`!DDxq+X5U%)TCV>BFgfJ>0(OkK*}Zhf6q!!nAZr=M@c9ByK_Z^X*&}VI5 zZkI%3EMAhaDZw?HrUxD4NuevN3+^?mdvYR8u2{}jmp##Z+Zijs-S_L487xuLH>rLt z6fmbHYA|I1_svXBy8k%I1csc^f7m(_LaN#8Bl)aiy?ItS_}%L|G3UM3Z2An5jKgAE zhGZO-mr)7>Tir4JaAC015SuI=dXGH2V3l;1Vwi+5e)?4x6{$mwYxA_tY_Pn?7w;^S z=69FnXH&`pQ|Z=yH}tR0qo>WOmVguu4Ai=uQ9(69)cE0$_H*TcZtsjqKJxa{?mQ2h z_OTjO%eMR(i~WPj;YSzymHD44pXXc=qQXdtPno?}k>f^cJzZo~XeL~mq{nWMujx56 z7*rlG{N}-8OUTi__`3%LKOM?^f7Spad1ljU1xJ&{BZr_3YdFk|{8F#OHwq$P{954& zZPO6`U6ttvG#1oHwrW-JOBZ(EBh9ACQsz<8*mKaC6g;(W0vjwiHMG!#p1)<@pRb>r zx?pPTHH>WvQSpXdj$Pmo>2x{1>!(c@?le#J(kag$x`l_sDl+e(=F{?{k~-v>AMuU4 zU9IgBe&hk0h8A*DqhOrg%beD_(a;FV+L1Fn>5eKJ!gGp_`n)Zj^N>RrKyhisou*5V z2DFGn$yjJR)R#y)VJ{zzF>kaNw$wNFHa_LKr5vzVduf+gahCgqT+4VDt9w(qD==Hw z`OGQ7KJ1G`k{HF7nn-ExO~ohW=k@VToiJ>C zx$YV8*236s@0AcPR+nJ;oNwqCsNSJz|9RcMwRjK*A|=y|15xZ9+JScR_vW#?Jp(^@ z1sc5~T2UQ?8*QV<(PQ(=* z8f*7Iw_s{$m##c&_j5XBR`Oz&Uyju@Maj>wY&Yv>Z7{P73oe*AXew3!5l#!Cdwigu zDWUJB$YE3TYIJ)#Ktm3CWjW~iI(NapweY@j0rYKJ$e}%1`xBNVZe7!Oud}_3C?gMn zJ!TK}p@1&B4=RYKA0{_Uh{FZ-29a|!cJjj|i7xr;pUOLq?+@{;x_B|LQ&`wWdk_Jx zDS5q2WdOu?5X@n?oS72x{f$K2>>3|X8^#epvAbSYlbHB}q<<`FWXQQCQ{>~kjdH2t z0dAMYKI8&mOFh&9KL0Cf1(p?>J%N_o|#$j~fD zbjp#YgD3ADpVVqq+Hw{-|9H#p)>UwRt}yO)umk2)J-~)`?9!au%Yl`CL438`9}VC zN5I&+d1Qv6Kr{L%$!VCB`lcTCX>Mw>LbHg3tw}DlkHk;lz9iCRrN|q<{6A_w4=Y3m zpUUDpHP(pcpGt1HNvn3NY#aTYAK+rum^0SS|BbaS?a5h)3Ga6>B#(iz{Cc!k-E2Qt z`@c)JN~B9M8s~j=N*H%4Jp-{pS?Wm3f)xtxi`d+(?dNpd|6%XV!=e8F{?Umf`%c!W zD9KhKg=Q)t`AA8~(o_->vXe1W3Rwq*PsCIRS;kJt#8{F__T89K_GN~QH?wqJpYQpd zbKm#(`#IM+_ql)9xqjEV?mwnWbItOa_xtsFu8+s#sc0EK#&qF)!N{;Huy?HXaK3Pk z)Tgv`wbcmS@qXjn?Q_uBY_HS9G1)U?Ng@}vWX#zK0O){)V>&Ph_VeE`kM7ud)u%{` z3T^4SQctyz{uxi(3Y)VyUq7K9eeJy?sk@Fe_=$XNiBCUlHv%Gsvlgl9OBi2IIPId( za6d4;J6D=iM7ZV&pfV~v3iULa8J!KkaknPpFsjv>zOL)5{3>RjseKweNy}gzPP?S& z(uR@(G&yVqBd_%C(2={N-7zd7!TRT#aj8MIb#Y&9jkX@bf|Y{(QFX!CimS`rEQuGy z8$EfQeClk*o39YJ3+IX*>;(IKgq1C>$tXV=pd$qj$|BwPBHfz zPoKZ%2H6$6%N1u7_Ql7bbxo50@Z=qpY@6A72|vnkd0li>Y9stYf=M{)N;7imEhw5c z@_?-ds2CCVLcUoIV_OChyN9X^bx}gjcSjX-k8fA^%LJR>>+KGLrFqnR{tat-F&yPL zB73muFx10TgC3YRW08176jRW@e6=am}HOn7C#3 z%#2~30u07UA$lMmVug`+yCbfvK|APo`zJ!MZ8g>-&WESYKiaa3Pr6Sp6UN;xZup^A!NJP*t5e^hB4MY*J`Nv3I2#IwAh~)$N>}o* z`wh-0pz)*xiYT$vX|>Dpv`~Z|%fCM$6lFs5s&3RT|5))xe#9+t#b*i9>^0e^rKP{5 zzIKE;Kqo9o`B1wfff@io4vl~dmS6mnlFo{ zDa*&W7$=2}8n|R&xrkHQM5!i4%!CIskmUfC=qCnt;dddUv`bujr^{p9!ct$FyYG8; zPx@1V)D73aLwd0R|akDl;J3DIN~#&`3Hop*g+Nk|t|m%GXJ@=lE9wZ%2Quizh~ zDd?Qdp6u>`D`7vkz;|Gm(jd}r*ac0`Z6(5NbO(N|4A35!2bzumVqBa9C%UKx--5&P zvi1PpjcvexAm99Zl>GmjzeCw%*7G9d_8O>~ot2uCn%%^%Sr`9?Eq=CQ@4j;S0bu!@ z7x_T7WwlY5~pdLL(Z7zOwAy1~Fk_*qg5 z23hmLIqdVX1$`x%J|EXu`K^@G`k@!B@1@?bJ#zjR8=|9F67ql)J?#S$#FiGie)Dgb zpwr#ARt9#ClCf$(#^C|9jvCocSJ`7Zw|&1Sqdd)C&n3+jQR<4PoKNm!9;p^9cye|Z zS^LiMrLp#L7X@44XTePw0f!k#VRRT%$rE0MZBe)Ik;jZUWD}>zVqabLN}1t@PiXxR zZ)beY3p(m@@XOwP6{p*RQPrk-Z$n%-0usz)ca`?OPy9AGOS-P!N682h%T2#nR&kE+ z<7D&2fz^0^rbsiHW{Drx2mT(X$vGHeNU?q7Qamy)(B+Xv_H!IRwBvS^^ua?fy#)F9 zCy|eA#po>q;{P4&Lu}P%w3m?@o!L^Obf~N{$a0tgxPTeP`7@InaL`TOqm 
z)o?7?bXgV7*p6tISABwQ3m7q^#SUEDWtHwV@uTngMi8bSC#oE}{ou%5E;DU*KmdFS zJ)T%^4%oWy4p~dgN7^Rl+^D;IOF)G~lpW2LJxb`MI^_wbsgyvmV_s?3fKQafn=2z1 zRT=L(N}y)i`MDjOg*?a9ad0m#kmVnfgdl@j7SC(qjK~Cm__QHU_rB8FaW6+K zzeC#n>@{*z*Up^+qta}2Aa0|afk&X!HD9aF%=*%6fmqC*7#&4f-ENy$3ZAXqqTsnxB0s*;U4e5hT|dGe1OypTKW z;^%~VC-16$%U&@wmwlrn6s$NUu=00(DQzNQ;xii*kuE-Nl83$peAV7&lrYN3+bKG9 zSM7cdi`OtST0{bt2W~bFFci1HgJ=hE=w?+6ZLkK&}pLuB74P5Y%wTR;PKOt6iHF$>u2u7N^-Az-yzrnq*9G3J( z9T(rwD&g1XxwRDE>uucuQ+vH=?6E2O{1!ahw`*vLC8$hofP?jY6~=|!e@UPLE(A&7 zDOxgz3f&n)3O`hK7G5*j@)GpY+w&`|qVBl+iv|`IiJX}K2zJ|vgQR+s!)&+2153}n zWBYpgyiOAtSUnvAT{Tt~{J;w5&2*1dhHB{UtnLQotlcsJ45c)e1J=i%%}e+bqim$o zyJc6NCc{|*;4F;_?Yv1N#E0j!B6j4Ys}~^$&zwIrNoLA>3vSzQN(uLP!h(7Zz!Idj z7&0Z`z5RNUu7ZntyQUy~2#e3k$bZHBnnPU)Q9g+KVr+;S>>dY|^(Z4ulbWgMWECiU zdX=yrR;qEdkvUwEap3f%++tE|be}TzC`ekf7z%W0^v4Evi_|6rA7f8`45v_;H|;%9 zw0br5+vWNKTZ&55z5|8Nrw0ro+4&}2aPsVP$#2vP|CY`DHqQD}+y!UD5WizLECq1P z6qX=}Oe*Tp&@OtjPb`BW`lnwX9*nD&jxLduL5xLx81?m8Fa8h&{T<3x`x}QH+kQF_G)wxNS?+5l#+CKQ)go%O8Hb!BmF{{67WXuIi>EOF%by3$? znvv3nxqyJQa-!Z|q~ye40gmAN!9V*spUc9{rq7%Ay9;>yrT1(xeaLY==lbItbL9Z!;(@lL=tRs|E~fyWNG-5O)ifmwayX*JdM1pU)AZ` z9vCWq8U9YkY;bMaxS{?S^E6aLks)T{7DR*qF^Q6qHnCuiw>OrFaQ~`aO-`2@eQt4H z&`kaCS!h4yaVwl3kV}vGj!XAxp%iE#<*fGX0mJvpqA^JaoX($_*TG$3JO-6}GqBa) z#!UbCRaYQM5g0d8ArAs%WW`GHC1F{@;Do_l!M>y3uzlq71LW)o*Zd)-9dwXx&fsO{ zl!xsTip&=5soUkL^X2Z0#L07o5vY9E!3$nQAr2pwejSu{A#b>4Lg3JthGvCgjKr)j z^4%%J`vPAQBX8MpsV{T?=APlCRP3LICU+89xj5Od&N}e^e&u^QdjL_3(;3-2HDOkq z<|P@grQuPUbISUJo2{Sl-Zax&{+CYKoKfMZKxQVxFA|3NCJa--&0T@$q|}(T91%tf z-EQS*Ro3)k`*np~9!~@-HlC7lh99%|2Fb@oU?PwStq^i6^rv z!7Ypj7BaH;Q`3vv zYu~pCz?^vEZ}*)4klgOyEf3wYAJcExJ-E^Xc5475r*qc=pabaKezI-etSy;#N30!R z)n}|?nt(OdWYZ!jTw$nKR`%jt%Y@Ln=PXwF1`&Q|8GZ#MvryR}FUubjpcS1{7K#HU zUzKJAC#&DsF|!b|XNmjgn=}=Enj8tEj*W#~s$87}{Wh2iPAeUB1d1Orp&!OHg|dVh z#?Tb02;LSh%24aq6o}2|$>!MWcX<6Z@q`lp{uQnH3v?I2#R8m~ zsPoV|t(BS6AcBYZTHFN@VspBXlUH%%nD5Kk;80T*u*#WD3Y{e z%tx5ZI|4>%NQ!1Mv88b1(CUmqVZc;;Ss8k~HYgDBwd+^+Sv3MG>?hkhl~ycMxw`0e zzN2oCAHCa|_U+EiWW}Pqkx6tBs}n&viGTs~M`!ys`z;I=Y;VIh^R&Q-H%RiCZ#$zs z+v?vx>iycnGWS29Vxg_y-so8b zk5QQ`Y8&APSGM+-)#v>@UM3u1ZW*RdMfGklkyFWqges4 z#4N)2<#1t|Tv+0a{XnQ6dYZa5_R$MuY=n$WbX$gm#Q2_=XEB%R)%OVF9~)dgkO?~^s}v*6V4U&1Z#yn+JSKJ0ZGJhr=vj_Sdw9qN@;lQ?ZOdO(HmTO^{|+5CMT z6lWXlLSc?d(wx*AWDa6^D@OK1E*_;&Yr2RMe+?^tOAU4CJcE0c zpCT8AqPzmzaC`W-1I)|^tGvSuDiR>*Q0$zQCihAMMWB4pq-yI6zd_h_d7XBH za{Ks->5Xf1Ro6ASwunEF)3#T86-(Z)o+c^CxJ%dw0)GMlZQ#Ex5owFZQpZR zd-{g;X~UCg5@FATIywVbg6?25(V01AR{5CfxNMsp*`!JM5a%WKK_NzA zXbiOl%g@AZiX-s%MY%{ z*(@U)_hiHcCv-5Ccyegs16VWRpllTNgAq=RX-Xui%o|eYP9{G{R9DIGl&XSoM9x z^E3u1eVmdFrp=^|XMrSL15S+1)*G5Io}yLXxSiwCEhdVJAGV;7DhQJ zT)30&m?=)+54t{eN3ZKp7)N9q`o+luFm(t{lK^B0`PcSrC|Qbm>Tg*p{}S6o>;(JY zC7%DxzqyG*3|WxX{)t{{$T|oh-nCBn-IB(ozjPgKFb-fu7Xck>7G~E0N$nswSaNw zLlraZkDu8X;7pJ95O-klgisEiIPcN=M2Idlt55GoM1kG-&wAZID!Bjf{=fU||BzVx z->o12UmT0CbP#Um@ab#a8GuW#2?_d?K`qBhBMkanZk`=E5%u-dIbups!^PINJE}aV zY$oaj6MQP(y}czXo6=8qhcTZvutWAFyL^gzGmEch-_llJ;jy-eXCr41OFX?U2dM{@ zj=bR+J+@6`3)aYVu(an;y35wI4hVF&5TF#?)-(HMlDD3Dh_4`S!_Ny1g_71=8vw`bK9LC&0?qR7x5QJ$D@Wj`a0^?mnKBN3}Q zxzrRxO+lj>x2hIG9C{06ORP5`VVf5ua4ik;y}OsM29FtVR+Lw!Dpj_LT{tPJ_(YXM zxmIn@^^VMp%^NVa`gyQqt(&o67nne790>&~aInU@SUG#uX#>A|wvdBGweLq?4nc1V zm*)US^9b0+t6W(fFd+uOM>y9Zx!1GqFyQd%VN>S&lTG;COy77_T%Do$w7-@gDc0cT z;|$@?X;#~|YP){e{-BC3gK0VoEGhxj)sYf9|D7&_5{E{}FagknwDbkdecQK}VT;OX zlhrj@K9>BMk^+jU(G}<7PrS*snnV0i=v{@>!DR?CAJ>_I3{Aw%+kH1>^~Q&S7}S{; z@U+5d>9;3eC;S+=5T)G|io09PHhrj+2G|lMeCV-kHfVnY@QU0RD2IzfaV_}!S?IKh z<5H%l=A4gMXIp%q=g+-R8+7()+P&ILg(7qkYGZ@`5(IQt0HoYJH;)hm7)J7k=+AHc 
zKw5vitalmqlvOfFglr-CPc_+Dkr+>L+e?=g98lDmqSrvxAKH> z`# zaqo;wOX@$j04D;_N6vZdYy_4TK`~NTHxzaT_$tSDoXpfc%>o6T8ysnao9w^P)gAC( zxDmEp-T{iHH33t?ZTIwcLzu>a%x61YwYgXZDo?VqZ$COkd)h6>Cv{GnM+`*0mr(Br6_pKz^(vz-!pXX0rv;So*9XAnr>{ z&c3`Xlz4BvpAs%LK5&CAiKFrWk}T^{030-BKvd01YOs|Hop8^k3bilq+xhv!R8ok9 z$i{6Ge}fW#%P`knC!^;&(d*D0>j?8Ot2*l&Wu0+rFJ^9Bi^=IQ%}v%!17XfI1~fp9 z7>1O5Uiovv7R(Ti-@?~{Yjk~Pwl=`yWu1B&0B{+#AIz32L)}-PIGMhmVS?DX{hlpp zZqN4?ul@JVKI(D}3J>hX+IYXH*EdG1|hZH^tKIM?h&dh0#2x>}uv-HrFJ2G(5jZHD_qoc#p>9bOUxWC8IP z!_(%wVFL5E?mh!leCMO{33>{3N8h#`Njj*qGi&wri2~#8HaEXIE>B-S8k9u1l-dvi zoLDjQN-t2ZfG;cX2IWq9fK9UaJS?Yt+;Pl*Yu}T-oB@sZsqrhd_=aCe1szGpVF0*5 z%yu$Yh5U+V&E>wuCG6vg`SBoB;{oo0Qu_|hcclgcvf5vH=G;P?F_22hyviGJPuk@r z(N1Ww9)E^X5oDlYx#X11Nk7&*icgID<~%2$dbegJI@aVSDo-4=2A-Q(Zxu>_QLYNL zVB+Btmot_(`_A>eZR>lw+~xIhR^%X`Vd37uc&6Q#^n1Pq+_Cu!xoV-M_Y$#d-`_vD za&Iy)(?4!)gKAIaZVrep9kQo7wPtz6k_SNNzg6;UH>)d}CCVxtQ84Y|4H=l4rp;=x z6FGHZ2M7o*Q(E2in9u4EFCY`!Y(!a0V4<5*X@}Oj!uY`(ILa#(ambLq6Lk_`;k95q z-i$I_nH8pbuc7O%)s&$@7SFVChZW5vV(Zw%hY!|4Jm0?aD5c369^S4Pd}@Dy1TvbO z+PVb#*0!6up&_d-jmEa|tbQo5^?ju>R?E`;tPZ8}wP^R`B z(WPCNRD4S;9ioi4t2ZjiA#*kCWX7U7(BItonKi6EtZsK>$hP1Y7cv!P_R`8A)w38s``V!Pf@0)i z>y1cZX_p&Eu75Atd>3RZy^+e|v8Ai9zM_r57zMi`JKD1adP9mlTJbNjJ`rKBo%83D zx7bnNUb!srz5R{R5xiddw;%Z=@3y0-LTxjBL+ouA@)ORyn7_9K!FqsQEa2$LAj7-b z8U(&+RJf_1xZ_n(SN7m?l=l2)$ z22z?vvyd{ZBTr$hIxxhm&VvrFRgu_9n;wwkw1fJDMNx=8K-Wlrmt6fP)FS+!ksYEy z5VqSNWS{OJRGQ}caOKCJp1B1V!u-{t3C#yKu=j-JXlxYBUIa4(8hn(D|3AE+zvlq` z`QPeYpxiN;e-4m_?&kn4)qxT?*f;Jy08&8rQ<&*PSsdNqB?mAMRT%Q%y7nBW2r=&! zfvDSVvkkNJ3=<7Ld6uwzpTGu~z-qnN)up#i%Uch?e*U}dO8zGYqxUUKp`$E2mH^n; z@-RfeZ=(|CE2w3}0I(y&`SrELu8b>F?32u80OCASQ$A2EJ!@-rXzWtbNGE?WQBQl^ zd|HjeDg6m)%1JQqxZ9W<2=Q+(Y2sE3EzqD)-?GsFm;*iUSv>?O0K~WVz}v*Fpg=5{ z=fTXwQpT~M>Bit^%;k&NsSz`+qEm}#EF%OmBogonL~KF>2kNlMQ@(cNGvlckTIlk3 zy4mDql_5fNqapYElH^|+wV_17jM0iQ&S`|<9VVVZwZt94Ms+3y-cBF3K22~rcZB!y zx%A*nT~nmxiuuWM?H!#c5cCTDfUW~}AC7K?A=P78ws|aACuNG2oJvap7Y#U1j7;Yi z?&PHXV*6GRaRF(967Uh~Fr(>Cm?lGWDRd9!y?G*?A5u6Kzz!i=`g{Zm>MN>7j-Bw~ z=e0aDnN*VwGfn!QD1uYW+7SBd1z9oRk4arj8Q1~{$79!ivTbNq2A4^yc4Ye9uyto$ zzH;_bR?Q{{{W7aNB7G zWK*1d$n4P3kTTlQ{9r@tpSsYpSQ$A!)ux%r538%oLj?|kn>qP}GOMOkyeP@@<@-?j z5tLgURPcKCB}@gK=9zcytd+=JM;?vS?=@R9L+>sU{bVeHKy@r&KJX>F1y<%+F#r7p z6on9J^0|o#*ZN*5k<`v|ykC-0YR&b9C^t%Xn(F%NaHT>R!1tSB42eMy#7qO4j`tG75FlBff-!$t`Lk;0b<)$HJX`#nvODQi zg4bdN8gx-?#V*G$-GPJIB;_#8&pH97!k^b>$wTZFp%hGc976)vylFMRhi=4RgAyq# zjhnQJd<2r*jui=ZAkib>w!MoJb>43;`oI?Modd=xH<*F5&(PIeL2SVO=S8{Hv7$F^y)tc z`u<5L`=?e&V%_qQKq$$3>BZ&VIQvZyv{O=4vu7_h8 zvB6#I0AcI){k8FNL};M)UtEkC036-FwD#gSF5yhYIfuFXRY6m zw}z@3DJ0zacH3O}?l+fV7*@=QIK?neC3`Vhvb(fNJpso2} zGmr3E%YuJxSamAP^jJ?xV1URYu7?5Ggn63K@K3f|&L99$0fD+u-mf1m-Kb@tZ*)19 zWuG^<@0a(W*^^pWT?k}P4`cNaDsmX21?1w_;AA?TjgUiknCn!ev^cq#C>iJ63Sgx1 zj^!wYj`>sx8Xj}e;(Nbvs~=8U#mK;?I~oiyM2Spr2&|=-V45zVRgncp>L^`zuotzG z@Uc%{9ik07RvDuo?A1HG6F)sY*mmxm9ghO%$-iB^{8zVBz>bA>VHZN^u4r(%@2WsI zJQX-YG#5*f!E8PL`U+MIy&EwcMhlnAZ?B06<$B~L`%I~AVU#~9aQTJ*ewcaJ;ty;V zVi-6F2;{Rqg0n6Ug46ZUjR*Ght>;#9@{l_ zYQb-Ho~Tq|jEH<(;VtOwN!Oed#&wcVtUPajg94Jp(ojF#U8K~2t$S?lD$%_sOzz#b z=2t3_IR_Cbju?u|QrvpD9?ChIh-q_4y?sx@H$V>W&E;q8a>);SMSJmFUM2BqTPRT| zUBwT1|60FVVPEPRGuU{_aXa*ge%<0cS&xb}aU5LW+hFd^*yFB8K5G9mn% z2_cF~cSd(**)O=!b)hAi-@p>$SQWK6G4#dbkApW|Dq1}q57hY&4f)jz>L3!Z;0Ob> zr3xX2D3JTfGPA*t1v&iwSdtKhS?6q#1N?I_1=Gv;X3i7(3vZs?yt-4~!wbIJDrmpR zy`U6IKR5&gx?88_i+o-nkStZ<^2CUcag%L|BARPyd6O`8OlD8$!dOfv`T#1JSq%5@ z!mALuO($=*<@Xfe!(3 z&!ZRg#t$idxw_o`F=M{Pf8G5NSHoUP>Ou$+KAcIbhBN>R9e5+YtcTV~@H-8P_S=w| z2dRQ(o^QtSqqBQT+c$fNRWZ+DK8=sibNss_tNmz%6gc3OCV`fVKr9Q8v7&~p_?dAN 
z>eM`5BV)?4PF*fF{_5qpPv45}j&hy$VOJilZkBtYikE130I^d;ok3s7#FhqG#L&7k zMc_@D?JH#B*=VcEINNH_md$08cJVxpX9^=P<=giaZT(U|$55}Y8uWE;KjjcK zXqBHTTRVXD20e)p3wNkiBv_QW@*H+sgD^uBo7yrc0v#k*4pb}zc{`rFovCMK>f`7C zf~`>A<=Qm*!Z@oBA<+QzDync|4oi^=P~#MR`{e9>)dLs$Ock{5XpkhgL^z&~?ZtZp ze1Dv(y6B;>-Sw_^;qqWwK@`>(4ygz&^i#daDlPW0&N*#Mw=pt{3n8B-D3ck5h2G9z zwteq-sO){@?lGCN_w1*W`TuQt!;1`17I{0SP z#oe`VCThomXYKO2iJ0*K9XSr;H<3?A6EZ(_+0kg`O98-h47+jG$NXGCXUW>oMi(ub z`4F_H#=f7L|MVISdM7ep)H#st;~UufJgJiL; zVtXNQS)@pQkRirL>x8Hi*tnchp*EqRmR5ykC%CjJBHv}N6X|7{&TQpOCExUf+U1mO zu&FaUb?l7Z!tp|6Kv!F#eP%wa$Qf#BCX{`BO%#$vV(lWhTeFq>m*(SQi}n^EG-GUy5UZt6SjLYwI0w9c(pN?ct&F>eiU2}>-x5g0aA7OO zkSx%JLx?g?_n9k01AR@|>OH1MG;;gK)_9a3sPruqJv&o=mJx@4Jh^B2)|YL#8rC38 z1mlCj`?7e&^8pM9^h#ZF?hGADdopR)1VILhv$94wWvCZMZ>@Ox_;ZrR<%QhTso1n6`)|K<{7H%4OB4db*dtxs2e@& zE3!I+;+_awEld^Nt9#{!j;Euw{48g@t5r2uD(WCv*8l3@YCa;8B}l<7qyw0>$^$~oPQhE13HRykkwXLkrblb51<5T?95t5aiXiMjA(;ZV!Fk|{$Lj}U1= z9zLwqPLM#wm@86otyn37Z&u>w(^ypDHt9E-H%g2+l&ke*z4~9@v_<4r z@S69!zMNtl@uNtxk_j{r*ES1)L>uVU<41^EOd0F~pefy!0h#u3^uPuvRwG%#VSoG; z0-%7m08r2IT{6Lx4PK|(vUVFl;E=Oiq8|XW%>}ZwbgRFEgf z{`RT-Au!G{oS!fs_b*87H%tMJt{xDr5R@k3ZSYPW@yv z9~_4b&(%NxR1lUyZ3O^ADDxSH5{+Tm5;>rBaj%c;;D(uzYbdc63qpC z73W!PnDRlKnHJ5rzr1S)N{xz*XJ@tcvUYg~dE37!U9)xax@U-4%=Nr%s;ZK})cxZQ z=O`sCK`qLUJ7J)F+CGR@a&@u9|743qS&))F4tz66E%#Z+;%V0&okzcN-XkX?^*QII zLCaL{YsY<~8WB9%+!-1L@0mMKo;4S*ul`o2^>WrQF1SkK-Nz}Z4&5!j;SKj!n*Ywf zU(o*dV))X3Lp;A{mE&T9>WhDdkNQvlj;m8DW|Al8SMv|IqQ8fac?1Xk?o$6TyZKEU zf`k&YvHTJS>T?=3=V4^qbtiLkic-S10R2Lq(@(an>%8hJ&^RFHCr-w9EeKHfTHfh@ z#gZeoeBS4m)OW>^dTabi^%TA?+7--QG`iVb>SjFSFg3Hu(Xz+(i}K*rzK#=ymwvc& z?FV}LAu*>Zi*<(h3TxZBmq*`Sft7o;#Jj~3Wm+>hQgT*C@O8d!;B7X*5ig4}Q9Bqx zKZrc#n0^g*>FHwJr=-ainaG9g`VS|#=e~ys+#PN<>R%!=}W>2+oN1KZ?JgIy(J08U55Fe+1e}?CP>63|zWd8_i`xQKu9~_()=oPR^F(0 zbzP};>&=UTI*h$HFYza^fqf%70~ps!8Nxu16cbQs9ay820rT*qF@aT6uq^(IBis~- z0SEkEI0#I-;EpgR)o@Boo@^H@HSnYB_3B5E;}D1kbX-5 z-QNXayFmm@^MCOt&ZGQXRoECEY(+S`_Lk?%YV-jC6y)UIPfE_#9V_R(w%l< zudnCxD^8D7_j^q7(9+u3Mun=b2~BTFV2JdAkQg|2va00j%+;Ai4AH#g+I&C5M&Z$S zsED2)Zij2G6TX{|y#iQhFt}3>{fiwFh>_wS`<_0^XLV4NbFOrx-`%Zs-JsD(HkPcA z(g>?BcOuroDM#%gRma!-JxCmF5lS?Cx{PVX+Voz%l`6IKBKNKjjK+iz0z=qpVxR=A zRk1W;Le6hG%cwZJ<|}heJIE;E)4`-I++qh;TsQ$7JjO`8VMTD>bVSAkyp!F>8$nf&B z#~Ht<{hTB)bU)Z#Z&3E~Ls;!Xn0HS2X{mMb&N;L>MJ3i*k*ZvAq2STVcfQ9DCkRzL zMrvXM)1Cx>ENKB#AMK4H-gA1bP1TAHDGIV-F|sn_E7UX9*w}+sm^~@%#bhu^(Sbk* zB7JZayCpa=^}g29V#%c$!sC?kTTGb&j9=GKOG`!$UVay-Hy834a;|9V)1}e_r*-?- zvyGinkZ~+tP#EK^p}hH`Prw_J?(4eTRlJQXITm*R2|K;KT5B}eo!d;k9i<0;OA#z* zIPyTDH@0l*c2L3znZ(0E*Knqf9<3~7M@P1SYIGcim=01P1n6`4QIaThD^pOBxftzZ z_ys=q!Vf2Be1iM#Tn@#p3sM-2D^{w+GIJR+3WsV(<_8Hrt{!mpL&Z|$(tFq2_1cgP z6<`6Xft4U!E^$Z8+<5r2CuhlUu;0ZwnZUtM?<4AwU-d@v(J9k}clPsIq#jietu-5V3o6fW%YS2s zLPNku9RV+wasPhRRD-Ahpe4jBdbt8elGo-f-xoLC5_I0T4-Srm(#-$s7c+d0`70vGl% z;8fsGF$)+9UPOC2H&hs*FkP_Mlv(#O=Rocw?WLF z@&*E_WcmvNLb&N_`*A(2NL<6v43J8CPDCEdoXFtpJSdhl1H0!US| zl`vA)1QXC~&_ZcJuRUKBS7z}iE7#{fRUXs$$rf@InXzU2$3uJeU$kMn*nr@+6j%IK z>s{hozYn`MwCZcqrjBWWy~UQsE!5IY(0F5ml2;7Aet(r(%6!krmCeg}p5{+q%`AW3 zW>H;PV?CLA&7n#is17#=8pN3Q;Q_EAGOY$efHq=6M6UM%fr4ogOTrd=w+uRzYFmDN z9V&C7`UK16Qf|xEtFau*1QIvSek6}ZrpRI|a3ELkCh#%DdMoPJfto{>X6&9Pn~|oc zC~{>`(!{^NM2!1VeCi&zqcNU^n~(437wmR)W;tv1U5qHT30Z~b8LLx#Pl55PMjfPT zF+}4?h6{X@^p>oUrMA?h@JV@rp4HlD71}ZBC46>on37*lP32+XkQ1aw=PVZmB0T73 ztgk?AacU`!AQmjN0~Nx|1&3X{v_P{frrCl-qqODU7OqtY<2aT*WTksM-CLPh=^^{t z|A;-es`9M9jGIe+?K#Zd{@{hr#ptY)h!|t1Hm#9p7VkkSb^@J9--|jOM=WVxirEBT zrerm4?TSG zGjLUbHJd!%BkJCl>vC3q0d`VmVX^ZA>b=doc2}$09SiuR7Y-S`C!<51Y}`U@7pwd# zx4IjS9A3MZ!14wytmDD$IRY1~xcLkDDDNJY9*~?Z?){D}i`wisSoUqPqNCO~)AQbp 
za{OUWe&14dS>ami?X{wHMG*p&!z|t^vY!cyhx7(bAjp^dMp(6*3(y@CU!NV~;FrYr zSIo|I7ju!eXH8h&j?pd5@MORCeAcsn_lCpqmwBFJ3q6`qDasn|1CBMDQ}n|OLPZ#sp;hQN9w)}+iS7? zrdthTH6)LfmR3i;@b%U@uQRqm$y>0Is+{0{RL!1>w;vLMQJS$_I>4dEFb=yKC7m_o z?T{CuIJS^a58qjE=CfnPgVdrqsya(h2@oQzP~db2p9HYPndN4zpYQMgqpT71l5to= zJjzEzxyLO{{h8$1@h1iiA`fN`6hOcZ{cnGQ{!OQ8@yy?QQ`~%uQ`PUx)hVd#RMm_*-A79%;5}f%EC||;~?2@OZ$xz6DG+=$5qsV*jtQ@ zh^fuD%z_di+^-xG5JZ|x>vYp>ChSw;g5_(Eck5lBNY>ps68N;Xm@5!>bh&FgC}9J1 z`hon8gmV>eyb6iT!y`Zo*nVy_`x+Og?tXu4%M-L1`{rTb-+_!~@vVS4^rLh`So-6U+yJ0hq5Q;nj(R99a#`P%oxk`Ww0Gwa#Y@ zAmK(QuuuzF;uH*0GA|1N?hyK^;>bIT592TP*6F-*t&QP7`%0MY(8B&EBbKuSh7#yC zpIL&_gJ2{?d%$*{$z=~;p)KPdOKsc5)O82`AG4$Q>PBF_~pA#QF*o z*k|^BAvpvleLM)&EYwMVyl<=34#a)><|Q0C03>VYkoRDm4BqZW6_mQVaWH+V4SeOov7Cz-VjQ+YGQQ{wTAx|V+HbGe zyOCb`=#6p)&)5^ooj=(eb=t)iXgKjZ6QBfXxZbk<;q*OV%Td96p-Fmkn*%Z@u3=Wo=G{GHxIDv!D(lO7 zt*<3LZrqk#Yx*8GPL!Q`R^W}H_%1PoamP}4DlqJGSs!Dr5w21lOHEsnJnIrazO$m6 zSOLRs!WDuvOTc@}jX{*l^I^V=N5&%dJh>tGeQ2A6!jH*OEB&JaAJb9bW9Zl^9wTV(c1K{c#F#H}3p?4TmA&|&i?_@=&VeNktP%}SW@-lu86lK? z$9FHf`er2K>`8b3TYG072xY%FeBDT7N!holtcC2m3@s$pv?yg4LSjyw5q^_vt>*d!P3`?|I(y+<#c+=W@+8zwNvH zzTeL`lKk2VMBsoG8rsV)2wYNHAzQc|_Z~EI-W}(}7OT68we7`>Ct*ltOWe=71SvNm z7tc3BQGU?56QidcgU4Jxk8CxSN*s9YbBGQEk|T^WFpa^B4b9-}W3>_K3RYF<`k=xrF09mEUxyYWGz;E7P64kqKSoo=SiH; zMXi{BObYzkSEu)UWIxPU1%IC>OM|MEL)!=05Qfh=WTB8|rFR}y$Bw=ix?jrV!ja{| z#M4xRs)J`WsBp#9xKK;w7)e_oZ#IzmM`f9pXNF*hpa&Bws_L2K10>B-nSN`ocJG?0 zon8;R39Symg~3Y-KNEu?@X<5Ty6!uVQ}-Uz-=%Lwx_-*HnUkx+wVXS8ErSUl^-cF3=DM7Z;dkrL(=C+XPXxWi{YV;XHR zo@v8zN==zBd|GuTk8U-tuj^AdCi^l(K1@jbUV9#T9he1oo`LlNtg+yRt7X!FQEP8w zqPo+3&H&z5)Gxy>dao?S{g!ATs>^8%Jp?L`plbzyGKp?u@b zj+BdNSbRmX{YYm4Ip)Co*@l61(rnBk)RjWutorcK;hl4qU+A+d%4p20Pc*++0B+y< zYSbGr?eb_QVy^43GGusn(#53k7FF@CxTLS9VHU$*HpzB9U3^<;D;EP;z?Ba*3bl*% zO~^~*l(6?%Z#5qg3Jcp~!bw@_j>hNuq|GO@o_gRMKrI0i+nvC7jG*3d6*<(cr80}- zE}v~Voxs!aK^+m_!GSPT{Y<9wfREqpWJfYXLJ9%}{?gM?VCQ`z^7+Yw^MWTyGHC^O znM(|7<4p~+c-kux!5p!tmui^Jz*>H0s)}nf^>7%Y)(R^sIO&DMboa2a@YIDz--#Yv z2}LIVJ8Y)IXvh8DQhihmcV|cD}Fb>A^5soMo{VO|*cjc(vzU zhBDMh#@Zwh?j)FviGFqt4&p!F%I9{c-0@5ItX&TmAgiBq2iwIorcuq(RGKWE5>wIC z$!{<+>2ZC--epffZ3m# zV;W^vK_EGxfpB>4&V&-Iv`(d;9-PhN+d@XifX_Nb52`zOV=z^=5y@(GeE!8t*V~Vu zHMVbJ9c)W;(Qoa_WP(Zo8f3m%0DauoYeVou4r5Kx|a=%_>@ zVMO(#ydtyImM3}zc!75DK-};wC%`Ff(5cqq zgByr|B>T^pPfO!VbUmxVrPwq;I-kdEe$DJS%?d2ezvSKjYv}2zaJ)Pvn_ltF5y<+S zgbk-{Ck0PihntWr?%97(Hzz9yL=+Ec^zWWSR&$>A_2+kt=zp^!VTkx zs{jqiDnCHGU*C~jH4$?^?IcOykw26BrJTFY)l{2VR>{hsrNh_bwUG)hPG(Jh;lV4l zGdQ7h;0+fbwFyLoUwUOo{#BrpdeaZQUq2o1QJhz&bw)eF`Tlcc1GTz)ze-1mKP3*z z8IKzR_zqK23V^pYJ|DKCsX%v0i(s+&nMBCO!(S?R?{kTy$Y@<$&arx!gHJ%>FPxZ? 
znY5iQyNhsoo%yLgWni$6A$l(4QG43x{FkY~aDYnY;3rG0Q$jw#oj{I!C z);+D8)7P)a@@ryIyURPWkuRqgE5N#y9eL9eEGnfK&_zPDi137hdeT6UOh@XU^WU15 zKRd=>RIkR3(8RvLa!y&AtuD(urX(Sdq>dM?fp1H?^L3QfQrIA_yV5yDy_ z`r?Jqu4dwdn@0|hkB`{x7lE-XQsSk7g>MrA?9089SUfvw)c4JOS-g4pd`ZUgaLF?_ z#u4*piaQ6o5wGZ<>4`W`q9`Q?asx^PUKPPp6^h<9lcFVggltfJ^9z>?;w9Vreu0q* zL>WFfS5iN;QkMe)2f>XN7`G80g7h@;4v9*+`q7)`&IN&&bC}^2KVn4&&O6U2QJPpE z?ck#FJV({IudpMOw~W>OJ&kR0EE-ekL)%#}DY&3AAuglDLkoaz5~}OOC%CvZtWXkO zTjzB=Xm!{`jO){8d-m!MrahJ$RTNpErnoG{VoG6mmw=n~aGL>l6_An0TvSN2ZbntM zTeFG3_Jx6ofnx`XS>hW)uf_So_nbbp&~LK+xc+j|Z8s~6ymCLE8OlPGw{te4AZ6{z zW-IkPrtx&W0Qy}fH%R?)sO(vy*$^<%BE^-NZUzY=?yvR$ochD5*a7V+|8TuSzQ@fM z4_=pla}7QY+^6MALBQI_Fa;>ni|V)7256GvTII<(sGD|-&s85~L=p?tPmki8Ip~i& zw=%68a@0gSsGcxsvd~CQ zY{}R`&t|Zo>cPZzS4n9`%ag>@pvVD#b`4Rlb|qai=S}sLY;nm%R;`J zi+!0|=kq8L{u{Y3Sd-fAo|aF%GdznDmuHJS8?Hd`x5X1N?<$PnlW)T_ctUJ7$Z@ot zf|H2WrMmz+ST6{@ZMEa9-vd>yw=smtS|R0C1KCv`$K@Ekj=p@?B-l(ZOAE(!zZ$YM{9SJjg#Y5$}-w85|zu$!0CvpFkS-6N$`h$fen`+~5?}Ios{B_JGw)x3a0u zU0A2=?y1LI!8lj`N2kmuXH6~ik?J-%Zt98X<0HD)N{%Y za~|+_e#;0}eNNlC3iq$mcs5%j+Z+hLG~l_nysWJ6a;ooFFR?R6#f%<3O51YY@Qp0f z7F#Q-0^?)mPZ$rSkjhLL<_gid&WIp>Xm0UC-)YS3n7PU`g1SDovK(K(8)_U ziF3WAnX^0W1br5iKElxi*PHD)Zz(a+B6;e|a~zP&TLoK`AYE9?)-S~dFmyMb<;cQaq_^S~U-KOX9zAM%8rROO z7UD?Yl%cUFwJqnX9fYOa?)6Ak*y1cu>s@2$A5okr=ELQ*|xx z4x{YnbD}-lhdUQjUpg+)&bNg+e-yptB$$HFfr6anZl;II1W>#C31`YNS5S^8OOKv# zTwJ;jR3lgA%?oQdV8T<9`txFF9#mw&)w+xKLj|-fJ>o~b5->ee9g{&O67^ZIO0qBfqnV;IVndUt!l~>#Z`EZ?@tz(k$YqB3Pd}~>+4)gPK7t{upAXpab`PqXTxA2w}%D22mJXRxQUGZfv*S6fJ*DVhu0T3 zldTY?I%aeMU@8LZ820*OLZfvnpCG)U7;n?4B{<{0N* z;vRgEqwCO)h6XXfc3&0+$%-M5K^I2)U71;^SG1WsE0uA^49@9ZqN?5DSnynx+;~A} zQp`dEK0ZAL$#CbVsG{}+_)eHtVHCMj2#SY{)W&x_do9_hBg9jQ3oOU{gn~8q7U^z{ zalCwQ^m6g*IxnWtto_jvx`P(L;)T(Uc#}Xkp4D{;#kgSc%q^Vp?AEiRtUlY+s#p$o zYKh&fLP}-;S6KKO?eKue2PGCp3$EOTT(j|Du}*?=ePQ^O@`Lom_&zD_hYRwXV&J$k z(*^WH6MD?Dvk)anVS}FBI3O%AOg**PZ1(=xxvvpgLSEgYLgOw>Q?^J*u8Jm*(<0y_ z2v83}YugUd;(umxLxBp2cdCMT`0Woob6|Z1j6KlWMULLg3oN zucgyZ^L3M%Y~x42gR$Iv9PKAkogZKc`Y>MK42gu%_VWbSizQq(d%b>dO*quZh;X-V zXW##wvKi;B{Q-AWbmyVs%6b#(j+4 zt+y?g*LhrMxn;Hc5o+XMsX5`>9Kkd?uPMnv;gq@hs*wX5VQJ&(dv^=Ueu;Z`X`3iY zwCn>e5FnTuPns~;HK=Il&Mhe>TqHW}xHUHZL8j$ddA*i!Auci)&z zHjC-WFg7N;v|EZz4vDWyr`g zgp@5dHVQBKCz)rCBQ{KQSIP?{p4-BkbD~I?B0f3Y_8(+S%38m{7stHM4dlyA{+sXeKJOu z=pIR4qTSiWsBNXq7~cLqxA8wuV}9>XG!y9IzsK#Taf6Qidx1rKpxpLfgUGm%AeQ(3 z`$r5~<|fyz2VHX@h*G!@K9&g9T}eXP46r(WSbCIsV;%H1omV1}s_@x#xg7?`{+^D1 zBK!M0k>n|t>u{8LdSv}B@VJKXD_!mscL*mv@*p|4XXMgV%Uc;IDUY67DktCm4WT_U(S3s#)@-61lg5^h2JFLf+V*01_(O(?@s-yDAFmfq93B|cH%1M zU8AOCS=VuUFB_Q@QW_yFf($cPR^E1nEKeuH!U_bl zn8g&5rs@JpIRZ$%JPFt|+nP;&-EjDgibv%X4z|o1onL;8?p6+Ikx~#)jyJg$K^jP7 zi$*T=A!-J4Dv#7>45&SbG!5dOc_n&ySG1R{;$-(HC5jZ3L*C91Xm5!Nt=~BqaLj4N z?)2*q9$!;0F*UgP$Vl(r5~r*wzXFPi39B)^TQ-5k6Co^?l3frysC--D#E3 zfzFY-C%oHYe8_@^F9xMaM4DfI^7;#fw)YfRwZ2~Xo4pi=4r~YRm&5U^(J7$v@YCJ` zb7l|ycBUiceIyNt@-Xlf5(!W(U(iArf{^Rqw76mHFY96Fr{LJbrBoNYSXkNT_%0wQ zxM+4eULa-1abfY4mD1RP8-@nqEMk5Z>KG6jr=5T13!JDz>fHTV5W)H-$u9`?jo!bA zgJc1{`9?EkZ)TevKSK%{piUID0(gIl@(2psrklimb#U*{Z6go&Yj;~KisYM)h}Xa3 zD4cb^bl&r4HH?kD`sCiTs=`A^9u9r#D{&i)G;%R)5(Ws=1PxQNL+C0-tA?D zdyz>*PLAEnEK9Ogr9aVUo8SqrvA7$cRxs8*0pDd~)h+RCN0p*Rr;u~9rvO~#U69JC zZialGgUdqTOFTuD^33c$T@kAaRg=>Qhz0~Jt_2Z|U&{6uYOE>Hasirg0WmP%XO0vX?7xvUU=o$9R??($6yCNB|t4_4AnQG6qDyh z=)Lti>yw*ni|JF_&M_&04)Y*{f-#?$wF%LT>UW8@4NA#nbi=mka~0AqL3u?N>z~#O z_c#S_E$i#se*o+0X{qP$YHRcou5#yQQ|yB-F%1(ZAdnL1TnCG;p@VwCoPLv8nH)tu zH3sE0*zO_4R*+WON4F*j0oVxV{Jj{Br`2bB%=lYaiHm#UjfSO88MPN^01zu~yaJpu z3D*mnlJzKd5bq@R-oWO?S01jPqO7buw5{|cW$`ZFW)V$yZ?bF|IrGv_%=)@_n%5cp zNO8%i?HyuPf(wxjN|P@DzL)UK&74UM9{aL 
zGxP(fi(@=1EI<-{#C_RUzvXA9m*?{^vtt*rQye(#7C`*W@V1HWlm)Z0oev zX8K^WsLqqfo)Cq2BjonZtex97_{&yl@)Vr;hw`qQCtme>7orvy7AIU?f}MH9;$PuA zJ5l>hSo|zH$I1Xvk}8Mpz=xGRL%(NmSCIPLdy53!m^;Y6`&((RCgUGT9_mVX$|uIT z%0F^=m(aOL|2FHlRVVc;mD4qcE4wak4338rEvS;9C}x=nG?edNb2dPyhqkq{ik&5f z@4AVj3&l+#S#wymqbO`-Z~McpyVeQv8>>+3IIuT$!AMyfh`2K3A}_A#ti6_D4B%%x zmI<}}`N^LhnYYa~87*vJXTE;x5=CIWO!hN?52C5N4l>C~KQ>yDO}=-0=r!K)TJcpO z!}qVoA3`+uIe-)?8w7AFHNtYhKB$jCwZSw;cp-1q1dQK1QW>tC{@h{_R-?_vpOlA_+b;RH#_63yDN2I_#$mLJw2|y8?i zTh1YY^n*ROd#XCPhxb*rC1&zXX=FQ2sA6sZOIbDU{!tanqi_O(fb z3)D6@7%afP26`|FJnvjXsN2heRb}N?{&F(O6QAFRKNVZ_4S3mb-3p8R=4}buRt8KK zrPYVdqVW|t-hC2ni^u6Q^v4*-xS;~d@xBT!I0a2gF9?q{yp1 zk%^UaOm5uX!k>f~hd058-V#>rIO;iEZ-YwitGiBKPrfj;+>*joZ#p84%=@I4?um1j{Qz-3})_o+89O2u0bSk$gA^vA&@ znkWP#Wl{o2Zje98@o*)H66v{!?aF()7hZ0A?8>7J*_7J~?jH*|Po6(pG^dz$Aj#K0 z=ac-8;|c#W&~TE$E>4zZ;AqP5gEtyD>UgNF@bepYe=EpP%(2l@8!(+jF?4~7{=u2{ z?z`XZcE44(#rZdg&s$r6qXPMVo&5AyTlN>%zQO{>VFxdKzRvD_t>rJw@*U#E-#seW zR}KG>FIW(qbYqN`vvm@L+pu9~@uRqaTVoq+R)8_4&Nzf)#{J#;drUUp3+J?tAmvXj zAA^%+gxE;`5`2`u`Ay&k{Tk``fAadDhJ*aaVOrQ}o^KIiC`{u?oI`hvtsWUk+lgzH zU>`IPp&c8LA@dIg8|*%MyD-VH5QjWC!s+Kv78g}Xz{sFwttSaPvtGIlf4kX9)iWO0!?e3 z_?d})wkAJuyvxKK9N%h~xyinKk>F&OcCR?tx0~-sxObEva{o4)w~w7H$Y;RN#)1(& ziyp?j3~#dIMfot;<8tf*j399`TdU%sSEuZFf&9_;f3Xf$)p=#q6Bm3)1EBcEin#ew_ zyPK6x#5GmJAqxXekPR>WGgEyU&F$ViV-Os99Ne|S7g#dL@RR*24|37&0zm7=vK@66 z*B1B7>p6el^{&iVstuO5igc;J7{`Vk2_u`y1x` z;;oqec=SJd{oi21cxC9lstV-)NjGz>wI|Lp@fVXE0a-|Vzq}U={#f2W+{ymfnf{ju z_+wN3-xGcRttL!$q4lu1Tda=JU=XQr`}42`@z)ZykD>A~S! zZyor8_&a2u0EK?mH9PV!{N1h}vd$Se%-@31vv_{~e?|-aVRg{&tiS&$fB(IR`l0^- DLO^x_ literal 0 HcmV?d00001 diff --git a/docs/source/index.rst b/docs/source/index.rst index 8ac09f6988893..fd741ea5e9766 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -114,6 +114,7 @@ Documentation usage/engine_args usage/env_vars usage/usage_stats + usage/disagg_prefill .. toctree:: :maxdepth: 1 diff --git a/docs/source/usage/disagg_prefill.rst b/docs/source/usage/disagg_prefill.rst new file mode 100644 index 0000000000000..9fe714b4fd856 --- /dev/null +++ b/docs/source/usage/disagg_prefill.rst @@ -0,0 +1,69 @@ +.. _disagg_prefill: + +Disaggregated prefilling (experimental) +======================================= + +This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change. + +Why disaggregated prefilling? +----------------------------- + +Two main reasons: + +* **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. ``tp`` and ``pp``) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. +* **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. + +.. note:: + Disaggregated prefill DOES NOT improve throughput. + +Usage example +------------- + +Please refer to ``examples/disaggregated_prefill.sh`` for the example usage of disaggregated prefilling. 
+
+
+Benchmarks
+----------
+
+Please refer to ``benchmarks/disagg_benchmarks/`` for disaggregated prefilling benchmarks.
+
+
+Development
+-----------
+
+We implement disaggregated prefilling by running 2 vLLM instances: one for prefill (we call it the prefill instance) and one for decode (we call it the decode instance). A connector then transfers the prefill KV caches and results from the prefill instance to the decode instance.
+
+All of the disaggregated prefilling implementation is under ``vllm/distributed/kv_transfer``.
+
+Key abstractions for disaggregated prefilling:
+
+* **Connector**: Connector allows the **kv consumer** to retrieve the KV caches of a batch of requests from the **kv producer**.
+* **LookupBuffer**: LookupBuffer provides two APIs: ``insert`` KV cache and ``drop_select`` KV cache. The semantics of ``insert`` and ``drop_select`` are similar to SQL, where ``insert`` inserts a KV cache into the buffer, and ``drop_select`` returns the KV cache that matches the given condition and drops it from the buffer.
+* **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports ``send_tensor`` and ``recv_tensor``.
+
+.. note::
+    ``insert`` is a non-blocking operation but ``drop_select`` is a blocking operation.
+
+Here is a figure illustrating how the above 3 abstractions are organized:
+
+.. image:: /assets/usage/disagg_prefill/abstraction.jpg
+  :alt: Disaggregated prefilling abstractions
+
+The workflow of disaggregated prefilling is as follows:
+
+.. image:: /assets/usage/disagg_prefill/overview.jpg
+  :alt: Disaggregated prefilling workflow
+
+The ``buffer`` in the figure corresponds to the ``insert`` API of LookupBuffer, and ``drop_select`` corresponds to the ``drop_select`` API of LookupBuffer.
+
+
+Third-party contributions
+-------------------------
+
+Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and the vLLM team will actively review and merge new PRs for third-party connectors).
+
+We recommend three implementation approaches:
+
+* **Fully-customized connector**: Implement your own ``Connector`` and call third-party libraries to send and receive KV caches, and more (for example, editing vLLM's model input to perform customized prefilling). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
+* **Database-like connector**: Implement your own ``LookupBuffer`` and support the ``insert`` and ``drop_select`` APIs just like SQL.
+* **Distributed P2P connector**: Implement your own ``Pipe`` and support the ``send_tensor`` and ``recv_tensor`` APIs, just like ``torch.distributed``. A minimal sketch of this approach is shown after this list.
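As a rough illustration of the distributed P2P approach above, here is a minimal sketch of a ``Pipe`` built directly on ``torch.distributed`` point-to-point ops. This is not part of vLLM: the class name ``TorchDistributedPipe``, the rank/dtype parameters, and the shape-then-payload wire format are assumptions made for this example, and it presumes a process group has already been initialized on both sides. Only the ``KVPipeBase`` interface (``send_tensor``, ``recv_tensor``, ``close``) comes from the code in this series. ::

    # Hypothetical sketch, not vLLM code. Assumes torch.distributed has already
    # been initialized (e.g. dist.init_process_group) with a backend that can
    # send/recv on the chosen device, and that both ends agree on the dtype.
    from typing import Optional

    import torch
    import torch.distributed as dist

    from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase


    class TorchDistributedPipe(KVPipeBase):
        """Single-direction FIFO pipe built on torch.distributed send/recv."""

        def __init__(self,
                     src_rank: int,
                     dst_rank: int,
                     device: str = "cpu",
                     dtype: torch.dtype = torch.float16):
            self.src_rank = src_rank
            self.dst_rank = dst_rank
            self.device = torch.device(device)
            self.dtype = dtype

        def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
            # Send the number of dimensions first; -1 signals a None payload.
            if tensor is None:
                dist.send(torch.tensor([-1], device=self.device),
                          dst=self.dst_rank)
                return
            shape = torch.tensor(tensor.shape, device=self.device)
            dist.send(torch.tensor([shape.numel()], device=self.device),
                      dst=self.dst_rank)
            dist.send(shape, dst=self.dst_rank)
            dist.send(tensor.to(device=self.device,
                                dtype=self.dtype).contiguous(),
                      dst=self.dst_rank)

        def recv_tensor(self) -> Optional[torch.Tensor]:
            # Receive the dimension count, then the shape, then the payload.
            ndim = torch.empty(1, dtype=torch.int64, device=self.device)
            dist.recv(ndim, src=self.src_rank)
            if ndim.item() == -1:
                return None
            shape = torch.empty(int(ndim.item()),
                                dtype=torch.int64,
                                device=self.device)
            dist.recv(shape, src=self.src_rank)
            payload = torch.empty(*shape.tolist(),
                                  dtype=self.dtype,
                                  device=self.device)
            dist.recv(payload, src=self.src_rank)
            return payload

        def close(self) -> None:
            # The process group is owned by the caller; nothing to clean up here.
            pass

The explicit shape handshake is only there because the receiver must allocate a matching buffer before it can post ``recv``; a real connector would also need to carry dtype and device information alongside the payload.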
From d263bd9df7b2f5586910e5d006a11ff11ba7c310 Mon Sep 17 00:00:00 2001 From: shangmingc Date: Mon, 16 Dec 2024 05:28:18 +0800 Subject: [PATCH 35/56] [Core] Support disaggregated prefill with Mooncake Transfer Engine (#10884) Signed-off-by: Shangming Cai --- vllm/config.py | 7 +- .../kv_transfer/kv_connector/factory.py | 3 +- .../kv_connector/simple_connector.py | 101 +++++-- .../kv_transfer/kv_pipe/mooncake_pipe.py | 272 ++++++++++++++++++ 4 files changed, 352 insertions(+), 31 deletions(-) create mode 100644 vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py diff --git a/vllm/config.py b/vllm/config.py index 37d062f7eb079..fce8011be4015 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2171,13 +2171,14 @@ def from_cli(cls, cli_value: str) -> "KVTransferConfig": return KVTransferConfig.model_validate_json(cli_value) def model_post_init(self, __context: Any) -> None: + supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"] if all([ - self.kv_connector is not None, - self.kv_connector != "PyNcclConnector" + self.kv_connector is not None, self.kv_connector + not in supported_kv_connector ]): raise ValueError(f"Unsupported kv_connector: {self.kv_connector}. " f"Supported connectors are " - f"`PyNcclConnector`.") + f"{supported_kv_connector}.") if self.kv_role is not None and self.kv_role not in [ "kv_producer", "kv_consumer", "kv_both" diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 015f892cec933..3e2bb436d24b5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -11,7 +11,8 @@ class KVConnectorFactory: @staticmethod def create_connector(rank: int, local_rank: int, config: "VllmConfig") -> KVConnectorBase: - if config.kv_transfer_config.kv_connector == 'PyNcclConnector': + supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"] + if config.kv_transfer_config.kv_connector in supported_kv_connector: from .simple_connector import SimpleConnector return SimpleConnector(rank, local_rank, config) else: diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index bf4f40ca94e29..4ace03ff1184e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -2,7 +2,8 @@ Simple KV Cache Connector for Distributed Machine Learning Inference The SimpleConnector transfers KV caches between prefill vLLM worker (KV cache -producer) and decode vLLM worker (KV cache consumer) using PyNcclPipe. +producer) and decode vLLM worker (KV cache consumer) using PyNcclPipe or +MooncakePipe. But the logic can be extended to support other pipe and lookup buffer. 
""" @@ -15,7 +16,6 @@ from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import ( SimpleBuffer) -from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe from vllm.logger import init_logger from vllm.sequence import IntermediateTensors @@ -36,7 +36,27 @@ def __init__( self.config = config.kv_transfer_config - logger.info("Initializing PyNcclConfig under kv_transfer_config %s", + if self.config.kv_connector == "PyNcclConnector": + from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import ( + PyNcclPipe) + logger.info( + "Initializing PyNcclConfig under kv_transfer_config %s", + self.config) + elif self.config.kv_connector == "MooncakeConnector": + # Check if MOONCAKE_CONFIG_PATH is set + import os + use_mooncake_distributed_pipe = os.getenv( + 'MOONCAKE_CONFIG_PATH') is not None + + if not use_mooncake_distributed_pipe: + raise ValueError( + "To use MooncakeConnector, you need to pass the ENV: " + "'MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json'.") + else: + from vllm.distributed.kv_transfer.kv_pipe.mooncake_pipe import ( # noqa: E501 + MooncakePipe) + logger.info( + "Initializing MooncakeConfig under kv_transfer_config %s", self.config) self.lookup_buffer_size = self.config.kv_buffer_size @@ -44,6 +64,11 @@ def __init__( self.producer_buffer: Optional[SimpleBuffer] = None self.consumer_buffer: Optional[SimpleBuffer] = None + self.producer_data_pipe: Union[PyNcclPipe, MooncakePipe] + self.consumer_data_pipe: Union[PyNcclPipe, MooncakePipe] + self.producer_signal_pipe: Union[PyNcclPipe, MooncakePipe] + self.consumer_signal_pipe: Union[PyNcclPipe, MooncakePipe] + # 2 pipes for every rank in the world port_offset_base = 2 * rank @@ -51,17 +76,26 @@ def __init__( # and the decode vLLM only uses recv pipe if self.config.is_kv_producer: - self.producer_data_pipe = PyNcclPipe( - local_rank=local_rank, - config=self.config, - port_offset=port_offset_base, - ) - self.producer_signal_pipe = PyNcclPipe( - local_rank=local_rank, - config=self.config, - port_offset=port_offset_base + 1, - device="cpu", - ) + if self.config.kv_connector == "PyNcclConnector": + self.producer_data_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base, + ) + self.producer_signal_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base + 1, + device="cpu", + ) + elif self.config.kv_connector == "MooncakeConnector": + self.producer_data_pipe = MooncakePipe( + local_rank=local_rank, + config=self.config, + ) + # We only need to initialize MooncakePipe once + self.producer_signal_pipe = self.producer_data_pipe + self.producer_buffer = SimpleBuffer(self.producer_signal_pipe, self.producer_data_pipe, self.config.kv_buffer_size) @@ -70,17 +104,25 @@ def __init__( # the current vLLM instance is KV consumer, so it needs to connect # its recv pipe to the send pipe of KV producder - self.consumer_data_pipe = PyNcclPipe( - local_rank=local_rank, - config=self.config, - port_offset=port_offset_base, - ) - self.consumer_signal_pipe = PyNcclPipe( - local_rank=local_rank, - config=self.config, - port_offset=port_offset_base + 1, - device="cpu", - ) + if self.config.kv_connector == "PyNcclConnector": + self.consumer_data_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base, + ) + self.consumer_signal_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base + 1, + 
device="cpu", + ) + elif self.config.kv_connector == "MooncakeConnector": + self.consumer_data_pipe = MooncakePipe( + local_rank=local_rank, + config=self.config, + ) + self.consumer_signal_pipe = self.consumer_data_pipe + self.consumer_buffer = SimpleBuffer( self.consumer_signal_pipe, self.consumer_data_pipe, @@ -260,6 +302,11 @@ def recv_kv_caches_and_hidden_states( def close(self): self.producer_data_pipe.close() - self.producer_signal_pipe.close() self.consumer_data_pipe.close() - self.consumer_signal_pipe.close() + if self.config.kv_connector == "PyNcclConnector": + self.producer_signal_pipe.close() + self.consumer_signal_pipe.close() + elif self.config.kv_connector == "MooncakeConnector": + # MooncakePipe reuses data_pipe for signal_pipe, so we only have to + # close the data_pipe. + pass diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py new file mode 100644 index 0000000000000..8e4358672b74d --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py @@ -0,0 +1,272 @@ +import json +import os +import pickle +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import zmq + +from vllm.config import KVTransferConfig +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from vllm.logger import init_logger + +logger = init_logger(__name__) +NONE_INT = -150886311 + + +@dataclass +class MooncakeTransferEngineConfig: + prefill_url: str + decode_url: str + metadata_backend: Union[str, None] + metadata_server: str + protocol: str + device_name: str + + @staticmethod + def from_file(file_path: str) -> 'MooncakeTransferEngineConfig': + """Load the config from a JSON file.""" + with open(file_path) as fin: + config = json.load(fin) + return MooncakeTransferEngineConfig( + prefill_url=config.get("prefill_url"), + decode_url=config.get("decode_url"), + metadata_backend=config.get("metadata_backend", None), + metadata_server=config.get("metadata_server"), + protocol=config.get("protocol", "tcp"), + device_name=config.get("device_name", ""), + ) + + @staticmethod + def load_from_env() -> 'MooncakeTransferEngineConfig': + """Load config from a file specified in the environment variable.""" + config_file_path = os.getenv('MOONCAKE_CONFIG_PATH') + if config_file_path is None: + raise ValueError( + "The environment variable 'MOONCAKE_CONFIG_PATH' is not set.") + return MooncakeTransferEngineConfig.from_file(config_file_path) + + +class MooncakeTransferEngine: + """Handles the transfer of data using mooncake_vllm_adaptor and ZeroMQ.""" + + def __init__(self, kv_rank: int, local_rank: int): + try: + import mooncake_vllm_adaptor as mva + except ImportError as e: + raise ImportError( + "Please install mooncake by following the instructions at " + "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501 + "to run vLLM with MooncakeConnector.") from e + + self.engine = mva.mooncake_vllm_adaptor() + self.local_rank = local_rank + + try: + self.config = MooncakeTransferEngineConfig.load_from_env() + logger.info("Mooncake Configuration loaded successfully.") + except ValueError as e: + logger.error(e) + raise + except Exception as exc: + logger.error( + "An error occurred while loading the configuration: %s", exc) + raise + prefill_host, base_prefill_port = self.config.prefill_url.split(':') + decode_host, base_decode_port = self.config.decode_url.split(':') + + # Avoid ports conflict when running prefill 
and decode on the same node + if prefill_host == decode_host and \ + base_prefill_port == base_decode_port: + base_decode_port = str(int(base_decode_port) + 100) + + prefill_port = int(base_prefill_port) + self.local_rank + decode_port = int(base_decode_port) + self.local_rank + self.prefill_url = ':'.join([prefill_host, str(prefill_port)]) + self.decode_url = ':'.join([decode_host, str(decode_port)]) + + self.initialize(self.prefill_url if kv_rank == 0 else self.decode_url, + self.config.metadata_server, self.config.protocol, + self.config.device_name, self.config.metadata_backend) + + self.remote_url = (self.decode_url + if kv_rank == 0 else self.prefill_url) + + # Initialize ZeroMQ context and sockets + self.context = zmq.Context() # type: ignore[attr-defined] + self.sender_socket = self.context.socket(zmq.constants.PUSH) + self.receiver_socket = self.context.socket(zmq.constants.PULL) + self.sender_ack = self.context.socket(zmq.constants.PULL) + self.receiver_ack = self.context.socket(zmq.constants.PUSH) + + self.buffer_cleaner = ThreadPoolExecutor(max_workers=1) + self._setup_metadata_sockets(kv_rank, prefill_host, base_prefill_port, + decode_host, base_decode_port) + + def _setup_metadata_sockets(self, kv_rank: int, p_host: str, p_port: str, + d_host: str, d_port: str) -> None: + """Set up ZeroMQ sockets for sending and receiving data.""" + # Offsets < 8 are left for initialization in case tp and pp are enabled + p_rank_offset = int(p_port) + 8 + self.local_rank * 2 + d_rank_offset = int(d_port) + 8 + self.local_rank * 2 + if kv_rank == 0: + self.sender_socket.bind(f"tcp://*:{p_rank_offset + 1}") + self.receiver_socket.connect(f"tcp://{d_host}:{d_rank_offset + 1}") + self.sender_ack.connect(f"tcp://{d_host}:{d_rank_offset + 2}") + self.receiver_ack.bind(f"tcp://*:{p_rank_offset + 2}") + else: + self.receiver_socket.connect(f"tcp://{p_host}:{p_rank_offset + 1}") + self.sender_socket.bind(f"tcp://*:{d_rank_offset + 1}") + self.receiver_ack.bind(f"tcp://*:{d_rank_offset + 2}") + self.sender_ack.connect(f"tcp://{p_host}:{p_rank_offset + 2}") + + def initialize(self, local_hostname: str, metadata_server: str, + protocol: str, device_name: str, + metadata_backend: Union[str, None]) -> None: + """Initialize the mooncake instance.""" + if metadata_backend is None: + self.engine.initialize(local_hostname, metadata_server, protocol, + device_name) + else: + supported_backend = ["etcd", "redis"] + metadata_backend = metadata_backend.lower() + if metadata_backend not in supported_backend: + raise ValueError( + "Mooncake Configuration error. 
`metadata_backend`" + f"should be one of {supported_backend}.") + + self.engine.initializeExt(local_hostname, metadata_server, + protocol, device_name, metadata_backend) + + def allocate_managed_buffer(self, length: int) -> int: + """Allocate a managed buffer of the specified length.""" + ret = self.engine.allocateManagedBuffer(length) + if ret <= 0: + logger.error("Allocation Return Error") + raise Exception("Allocation Return Error") + return ret + + def free_managed_buffer(self, buffer: int, length: int) -> int: + """Free a previously allocated managed buffer.""" + return self.engine.freeManagedBuffer(buffer, length) + + def transfer_sync(self, buffer: int, peer_buffer_address: int, + length: int) -> int: + """Synchronously transfer data to the specified address.""" + ret = self.engine.transferSync(self.remote_url, buffer, + peer_buffer_address, length) + if ret < 0: + logger.error("Transfer Return Error") + raise Exception("Transfer Return Error") + return ret + + def write_bytes_to_buffer(self, buffer: int, user_data: bytes, + length: int) -> int: + """Write bytes to the allocated buffer.""" + return self.engine.writeBytesToBuffer(buffer, user_data, length) + + def read_bytes_from_buffer(self, buffer: int, length: int) -> bytes: + """Read bytes from the allocated buffer.""" + return self.engine.readBytesFromBuffer(buffer, length) + + def wait_for_ack(self, src_ptr: int, length: int) -> None: + """Asynchronously wait for ACK from the receiver.""" + ack = self.sender_ack.recv_pyobj() + if ack != b'ACK': + logger.error("Failed to receive ACK from the receiver") + + self.free_managed_buffer(src_ptr, length) + + def send_bytes(self, user_data: bytes) -> None: + """Send bytes to the remote process.""" + length = len(user_data) + src_ptr = self.allocate_managed_buffer(length) + self.write_bytes_to_buffer(src_ptr, user_data, length) + self.sender_socket.send_pyobj((src_ptr, length)) + self.buffer_cleaner.submit(self.wait_for_ack, src_ptr, length) + + def recv_bytes(self) -> bytes: + """Receive bytes from the remote process.""" + src_ptr, length = self.receiver_socket.recv_pyobj() + dst_ptr = self.allocate_managed_buffer(length) + self.transfer_sync(dst_ptr, src_ptr, length) + ret = self.read_bytes_from_buffer(dst_ptr, length) + + # Buffer cleanup + self.receiver_ack.send_pyobj(b'ACK') + self.free_managed_buffer(dst_ptr, length) + + return ret + + +class MooncakePipe(KVPipeBase): + """MooncakeTransferEngine based Pipe implementation.""" + + def __init__(self, + local_rank: int, + config: KVTransferConfig, + device: Optional[str] = None): + """Initialize the mooncake pipe and set related parameters.""" + self.config = config + self.local_rank = local_rank + self.kv_rank = self.config.kv_rank + if device is None: + self.device = self._select_device(self.config.kv_buffer_device) + else: + self.device = self._select_device(device) + + self.transfer_engine = MooncakeTransferEngine(self.kv_rank, + self.local_rank) + self.transport_thread: Optional[ThreadPoolExecutor] = None + self.none_tensor = torch.tensor([NONE_INT], device=self.device) + + def _select_device(self, device: str) -> torch.device: + """Select available device (CUDA or CPU).""" + logger.info("Selecting device: %s", device) + if device == "cuda": + return torch.device(f"cuda:{self.local_rank}") + else: + return torch.device("cpu") + + def tensor_hash(self, tensor: torch.Tensor) -> int: + """Calculate the hash value of the tensor.""" + return hash(tensor.data_ptr()) + + def _send_impl(self, tensor: torch.Tensor) -> None: + """Implement 
the tensor sending logic.""" + value_bytes = pickle.dumps(tensor) + self.transfer_engine.send_bytes(value_bytes) + + def _recv_impl(self) -> torch.Tensor: + """Implement the tensor receiving logic.""" + data = self.transfer_engine.recv_bytes() + return pickle.loads(data) + + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: + """Send tensor to the target process.""" + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + tensor = tensor if tensor is not None else self.none_tensor + assert (len(tensor.shape) > 0) + self.transport_thread.submit(self._send_impl, tensor) + + def recv_tensor(self) -> Optional[torch.Tensor]: + """Receive tensor from other processes.""" + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + tensor = self.transport_thread.submit(self._recv_impl).result() + if tensor.numel() == 1 and tensor.item() == NONE_INT: + return None + else: + return tensor + + def close(self) -> None: + """Cleanup logic when closing the pipe.""" + self.transfer_engine.sender_socket.close() + self.transfer_engine.receiver_socket.close() + self.transfer_engine.sender_ack.close() + self.transfer_engine.receiver_ack.close() + self.transfer_engine.context.term() # Terminate the ZMQ context + logger.info("Closed the transfer engine and cleaned up resources.") From 25ebed2f8ca6d747d63f2be9ede023c561851ac8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 15 Dec 2024 13:33:00 -0800 Subject: [PATCH 36/56] [V1][Minor] Cache np arange to reduce input preparation overhead (#11214) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index abcd4b007a326..67166fb05085c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -118,6 +118,12 @@ def __init__( dtype=self.dtype, device=self.device) + # OPTIMIZATION: Cache the tensors rather than creating them every step. + self.arange_np = np.arange(max(self.max_num_reqs, self.max_model_len), + dtype=np.int32) + # NOTE(woosuk): These tensors are "stateless", i.e., they are literally + # a faster version of creating a new tensor every time. Thus, we should + # not make any assumptions about the values in these tensors. self.input_ids_cpu = torch.zeros(self.max_num_tokens, dtype=torch.int32, device="cpu", @@ -269,11 +275,13 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Get request indices. # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] - req_indices = np.repeat(np.arange(num_reqs), num_scheduled_tokens) + req_indices = np.repeat(self.arange_np[:num_reqs], + num_scheduled_tokens) # Get batched arange. # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - arange = np.concatenate([np.arange(n) for n in num_scheduled_tokens]) + arange = np.concatenate( + [self.arange_np[:n] for n in num_scheduled_tokens]) # Get positions. 
positions_np = self.positions_np[:total_num_scheduled_tokens] From da6f40924609e084ced486cae5b4ddf97133acd9 Mon Sep 17 00:00:00 2001 From: AlexHe99 Date: Mon, 16 Dec 2024 08:33:58 +0800 Subject: [PATCH 37/56] Update deploying_with_k8s.rst (#10922) --- docs/source/serving/deploying_with_k8s.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/serving/deploying_with_k8s.rst b/docs/source/serving/deploying_with_k8s.rst index 7dc076dc709df..cc3606f0df851 100644 --- a/docs/source/serving/deploying_with_k8s.rst +++ b/docs/source/serving/deploying_with_k8s.rst @@ -162,7 +162,7 @@ To test the deployment, run the following ``curl`` command: curl http://mistral-7b.default.svc.cluster.local/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "facebook/opt-125m", + "model": "mistralai/Mistral-7B-Instruct-v0.3", "prompt": "San Francisco is a", "max_tokens": 7, "temperature": 0 @@ -172,4 +172,4 @@ If the service is correctly deployed, you should receive a response from the vLL Conclusion ---------- -Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. \ No newline at end of file +Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. From 69ba344de8683ec4d3d42d11ae4e147a2a302da8 Mon Sep 17 00:00:00 2001 From: chenqianfzh <51831990+chenqianfzh@users.noreply.github.com> Date: Sun, 15 Dec 2024 16:38:40 -0800 Subject: [PATCH 38/56] [Bugfix] Fix block size validation (#10938) --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5a73c6ee02e0c..0aa367a173b6c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -425,7 +425,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, - choices=[8, 16, 32, 64, 128], + choices=[8, 16, 32], help='Token block size for contiguous chunks of ' 'tokens. 
This is ignored on neuron devices and ' 'set to max-model-len') From 17138af7c45eba3aba3e9b84a3852b4ba81e460f Mon Sep 17 00:00:00 2001 From: yansh97 Date: Mon, 16 Dec 2024 16:15:40 +0800 Subject: [PATCH 39/56] [Bugfix] Fix the default value for temperature in ChatCompletionRequest (#11219) --- vllm/entrypoints/openai/protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index dfb7c977dbd43..6ed7c2e9dcd6b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -211,7 +211,7 @@ class ChatCompletionRequest(OpenAIBaseModel): stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False stream_options: Optional[StreamOptions] = None - temperature: Optional[float] = 0.7 + temperature: Optional[float] = 1.0 top_p: Optional[float] = 1.0 tools: Optional[List[ChatCompletionToolsParam]] = None tool_choice: Optional[Union[Literal["none"], Literal["auto"], From b3b1526f03906c935e6ef80a2cdc971a65fdf7e2 Mon Sep 17 00:00:00 2001 From: cennn <61925104+cennn@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:20:49 +0800 Subject: [PATCH 40/56] WIP: [CI/Build] simplify Dockerfile build for ARM64 / GH200 (#11212) Signed-off-by: drikster80 Co-authored-by: drikster80 --- Dockerfile | 40 +++++++++++++++---- docs/source/serving/deploying_with_docker.rst | 26 ++++++++++++ requirements-build.txt | 2 +- requirements-cuda-arm64.txt | 3 ++ requirements-cuda.txt | 4 +- 5 files changed, 64 insertions(+), 11 deletions(-) create mode 100644 requirements-cuda-arm64.txt diff --git a/Dockerfile b/Dockerfile index c1b6e1bbfe354..123703848749c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 +ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies @@ -46,9 +47,14 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt +COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-cuda.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -63,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} #################### WHEEL BUILD IMAGE #################### FROM base AS build +ARG TARGETPLATFORM # install build dependencies COPY requirements-build.txt requirements-build.txt @@ -70,6 +77,11 @@ COPY requirements-build.txt requirements-build.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi + COPY . . 
ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ @@ -134,8 +146,8 @@ COPY requirements-test.txt requirements-test.txt COPY requirements-dev.txt requirements-dev.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt - #################### DEV IMAGE #################### + #################### vLLM installation IMAGE #################### # image with vLLM installed FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base @@ -143,6 +155,9 @@ ARG CUDA_VERSION=12.4.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive +ARG TARGETPLATFORM + +COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment @@ -168,18 +183,25 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ -# install vllm wheel first, so that torch etc will be installed +# Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose RUN --mount=type=cache,target=/root/.cache/pip \ - . /etc/environment && \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + pip uninstall -y torch && \ + python3 -m pip install -r requirements-cuda-arm64.txt; \ + fi + +RUN --mount=type=cache,target=/root/.cache/pip \ +. 
/etc/environment && \ +if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ +fi COPY examples examples #################### vLLM installation IMAGE #################### - #################### TEST IMAGE #################### # image to run unit testing suite # note that this uses vllm installed by `pip` @@ -209,7 +231,6 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 RUN mkdir test_docs RUN mv docs test_docs/ RUN mv vllm test_docs/ - #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### @@ -218,8 +239,11 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' timm==0.9.10 - + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \ + else \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \ + fi ENV VLLM_USAGE_SOURCE production-docker-image ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 14d94b09e9b9c..11a9f12fd17cd 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -37,6 +37,32 @@ You can build and run vLLM from source via the provided `Dockerfile =61 setuptools-scm>=8 -torch==2.5.1 +torch==2.5.1; platform_machine != 'aarch64' wheel jinja2 diff --git a/requirements-cuda-arm64.txt b/requirements-cuda-arm64.txt new file mode 100644 index 0000000000000..bbcb5cb7012ce --- /dev/null +++ b/requirements-cuda-arm64.txt @@ -0,0 +1,3 @@ +--index-url https://download.pytorch.org/whl/nightly/cu124 +torchvision==0.22.0.dev20241215; platform_machine == 'aarch64' +torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64' diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 058ab7c1ee9df..5d4dee8c7129a 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -4,7 +4,7 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package -torch == 2.5.1 +torch == 2.5.1; platform_machine != 'aarch64' # These must be updated alongside torch -torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 From bddbbcb132429084ede62855bcd6a1023a3645c1 Mon Sep 17 00:00:00 2001 From: Jani Monoses Date: Mon, 16 Dec 2024 11:56:19 +0200 Subject: [PATCH 41/56] [Model] Support Cohere2ForCausalLM (Cohere R7B) (#11203) --- docs/source/models/supported_models.rst | 4 ++-- tests/models/registry.py | 2 ++ tests/models/test_initialization.py | 4 ++++ vllm/model_executor/models/commandr.py | 19 +++++++++++++++++-- vllm/model_executor/models/registry.py | 1 + 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index cae4a88de1638..3bef3f3226062 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -118,9 +118,9 @@ Text Generation (``--task generate``) - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. - ✅︎ - ✅︎ - * - :code:`CohereForCausalLM` + * - :code:`CohereForCausalLM`,:code:`Cohere2ForCausalLM` - Command-R - - :code:`CohereForAI/c4ai-command-r-v01`, etc. + - :code:`CohereForAI/c4ai-command-r-v01`, :code:`CohereForAI/c4ai-command-r7b-12-2024`, etc. - ✅︎ - ✅︎ * - :code:`DbrxForCausalLM` diff --git a/tests/models/registry.py b/tests/models/registry.py index 6a8b1742ceae3..fac8c4b2e9b19 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -53,6 +53,8 @@ class _HfExamplesInfo: # ChatGLMModel supports multimodal "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01", trust_remote_code=True), + "Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024", # noqa: E501 + trust_remote_code=True), "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"), "DeciLMForCausalLM": _HfExamplesInfo("Deci/DeciLM-7B-instruct", trust_remote_code=True), diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 3b728f2744fca..a4eea7f035c91 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,6 +1,7 @@ from unittest.mock import patch import pytest +import transformers from transformers import PretrainedConfig from vllm import LLM @@ -11,6 +12,9 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + if (model_arch == "Cohere2ForCausalLM" + and transformers.__version__ < "4.48.0"): + pytest.skip(reason="Model introduced in HF >= 4.48.0") if not model_info.is_available_online: pytest.skip("Model is not available online") diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 85e24ca660686..c846e42f1b0c3 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -48,7 +48,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -171,12 +171,26 @@ def __init__( rope_scaling=self.rope_scaling, is_neox_style=False, ) + + sliding_window = getattr(config, "sliding_window", None) + # Model v2 has sliding windows, v1 does not + self.v1 = sliding_window is None + + layer_idx = extract_layer_index(prefix) + 
layer_has_sliding_window = ( + getattr(config, "sliding_window_pattern", False) + and (layer_idx + 1) % self.config.sliding_window_pattern != 0) + + self.sliding_window = (sliding_window + if layer_has_sliding_window else None) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + per_layer_sliding_window=self.sliding_window, prefix=f"{prefix}.attn") if self.use_qk_norm: self.q_norm = LayerNorm(param_shape=(self.num_heads, @@ -206,7 +220,8 @@ def forward( q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) if self.use_qk_norm: q, k = self._apply_qk_norm(q, k) - q, k = self.rotary_emb(positions, q, k) + if self.v1 or self.sliding_window: + q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 4e77746f312e3..68a2467a813a1 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -41,6 +41,7 @@ "BloomForCausalLM": ("bloom", "BloomForCausalLM"), # ChatGLMModel supports multimodal "CohereForCausalLM": ("commandr", "CohereForCausalLM"), + "Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"), "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), From d927dbcd889fb2476cb61ea477ff51e5dd9e1ae3 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 16 Dec 2024 18:09:53 +0800 Subject: [PATCH 42/56] [Model] Refactor Ultravox to use merged input processor (#11198) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung --- examples/offline_inference_audio_language.py | 10 +- tests/distributed/test_pipeline_parallel.py | 2 +- tests/entrypoints/openai/test_audio.py | 1 + .../audio_language/test_ultravox.py | 5 +- vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/models/ultravox.py | 244 ++++++++---------- vllm/multimodal/processing.py | 19 +- 7 files changed, 129 insertions(+), 154 deletions(-) diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 050b791b62adb..68b786961b14a 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -25,16 +25,16 @@ def run_ultravox(question: str, audio_count: int): tokenizer = AutoTokenizer.from_pretrained(model_name) messages = [{ - 'role': - 'user', - 'content': - "<|reserved_special_token_0|>\n" * audio_count + question + 'role': 'user', + 'content': "<|audio|>\n" * audio_count + question }] prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - llm = LLM(model=model_name, limit_mm_per_prompt={"audio": audio_count}) + llm = LLM(model=model_name, + trust_remote_code=True, + limit_mm_per_prompt={"audio": audio_count}) stop_token_ids = None return llm, prompt, stop_token_ids diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 85d408efafe96..ddbf40f089407 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -214,7 +214,7 @@ def iter_params(self, model_name: str): "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(), 
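
The commandr.py hunk above decides per layer whether to use sliding-window attention: a layer gets the window unless its 1-based index is a multiple of sliding_window_pattern, and on v2 models rotary embeddings are only applied on those sliding-window layers. A small sketch of that selection rule, using a hypothetical 8-layer model and sliding_window_pattern=4 chosen purely for illustration:

    # Hypothetical values for illustration; the real ones come from the HF config.
    num_layers = 8
    sliding_window_pattern = 4

    for layer_idx in range(num_layers):
        # Mirrors: (layer_idx + 1) % sliding_window_pattern != 0
        uses_sliding_window = (layer_idx + 1) % sliding_window_pattern != 0
        kind = "sliding window + RoPE" if uses_sliding_window else "global attention, no RoPE"
        print(f"layer {layer_idx}: {kind}")

    # Layers 0-2 and 4-6 use the sliding window; layers 3 and 7 stay global.
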
- "fixie-ai/ultravox-v0_3": PPTestSettings.fast(), + "fixie-ai/ultravox-v0_3": PPTestSettings.fast(trust_remote_code=True), # [Encoder-decoder] # TODO: Implement PP # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index a74109e2f5120..b579dcbb5c402 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -25,6 +25,7 @@ def server(): "--max-num-seqs", "5", "--enforce-eager", + "--trust-remote-code", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index e100c6b9bb906..c548cfdf53414 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -16,7 +16,7 @@ AudioTuple = Tuple[np.ndarray, int] -VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" +VLLM_PLACEHOLDER = "<|audio|>" HF_PLACEHOLDER = "<|audio|>" CHUNKED_PREFILL_KWARGS = { @@ -46,7 +46,8 @@ def audio(request): def server(request, audio_assets): args = [ "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager", - f"--limit-mm-per-prompt=audio={len(audio_assets)}" + f"--limit-mm-per-prompt=audio={len(audio_assets)}", + "--trust-remote-code" ] + [ f"--{key.replace('_','-')}={value}" for key, value in request.param.items() diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c2054dcbfce0e..aaa5cd759366a 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -418,7 +418,7 @@ def _placeholder_str(self, modality: ModalityStr, raise TypeError(f"Unknown {modality} model type: {model_type}") elif modality == "audio": if model_type == "ultravox": - return "<|reserved_special_token_0|>" + return "<|audio|>" if model_type == "qwen2_audio": return (f"Audio {current_count}: " f"<|audio_bos|><|AUDIO|><|audio_eos|>") diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index ea1e5401d42c0..ebaa8a4c4f38a 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,41 +3,39 @@ import math from functools import cached_property, lru_cache -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union, cast) +from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) import numpy as np import torch import torch.utils.checkpoint from torch import nn from torch.nn import functional as F +from transformers import BatchFeature from transformers.models.whisper import WhisperFeatureExtractor from transformers.models.whisper.modeling_whisper import WhisperEncoder from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, - NestedTensors) -from vllm.multimodal.utils import (cached_get_tokenizer, - 
consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataDict, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig -from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings_from_map) -_AUDIO_PLACEHOLDER_TOKEN = 128002 _AUDIO_TOKENS_PER_SECOND = 6.25 @@ -72,64 +70,18 @@ def get_ultravox_max_audio_tokens(ctx: InputContext): return math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) -def dummy_seq_data_for_ultravox( - ctx: InputContext, - seq_len: int, - audio_count: int, -): - audio_length = min(get_ultravox_max_audio_tokens(ctx), - seq_len // audio_count) +class UltravoxMultiModalProcessor(BaseMultiModalProcessor): - return SequenceData.from_prompt_token_counts( - (_AUDIO_PLACEHOLDER_TOKEN, audio_length * audio_count), - (0, seq_len - audio_length * audio_count)), { - "audio": - consecutive_placeholder_ranges(num_items=audio_count, - item_size=audio_length) - } - - -def dummy_audio_for_ultravox( - ctx: InputContext, - audio_count: int, -): - feature_extractor = whisper_feature_extractor(ctx) - audio_and_sr = (np.array([0.0] * feature_extractor.chunk_length), 1) - return {"audio": [audio_and_sr] * audio_count} - - -def dummy_data_for_ultravox( - ctx: InputContext, - seq_len: int, - mm_counts: Mapping[str, int], -): - audio_count = mm_counts["audio"] - seq_data, ranges = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count) - mm_dict = dummy_audio_for_ultravox(ctx, audio_count) - - return DummyData(seq_data, mm_dict, ranges) - - -def input_mapper_for_ultravox(ctx: InputContext, data: object): - if not isinstance(data, list): - data = [data] - - if len(data) == 0: - return MultiModalKwargs() - - # If the audio inputs are embeddings, no need for preprocessing - if is_list_of(data, torch.Tensor, check="all"): - return MultiModalKwargs({"audio_embeds": data}) - - audio_features = [] - for audio_input in data: - if not isinstance(audio_input, tuple): - raise NotImplementedError( - f"Unsupported data type: {type(audio_input)}") - - (audio, sr) = cast(Tuple[np.ndarray, Union[float, int]], audio_input) - feature_extractor = whisper_feature_extractor(ctx) + def _get_feature_extractor(self) -> WhisperFeatureExtractor: + return self._get_hf_processor().audio_processor.feature_extractor + def _resample_audio( + self, + audio: np.ndarray, + sr: int, + ) -> Dict[str, Union[np.ndarray, int]]: + # resample audio to the model's sampling rate + feature_extractor = self._get_feature_extractor() if sr != feature_extractor.sampling_rate: try: import librosa @@ -140,78 +92,92 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): orig_sr=sr, target_sr=feature_extractor.sampling_rate) sr = feature_extractor.sampling_rate + return {"audio": audio, "sampling_rate": sr} - minimum_audio_length = feature_extractor.n_fft // 2 + 1 - if len(audio) < minimum_audio_length: - # Not enough audio; pad it. 
- audio = np.pad(audio, (0, minimum_audio_length - len(audio))) - - single_audio_features = feature_extractor( - audio, sampling_rate=sr, padding="longest", - return_tensors="pt")["input_features"] - - # Remove the batch dimension because we're wrapping it in a list. - audio_features.append(single_audio_features.squeeze(0)) - - return MultiModalKwargs({"audio_features": audio_features}) - - -def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "audio" not in multi_modal_data: - return inputs + def _apply_hf_processor( + self, + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> BatchFeature: + if not mm_data or not mm_data.get("audio", None): + return super()._apply_hf_processor(prompt, mm_data, + mm_processor_kwargs) + + audio_data = mm_data["audio"] + if not isinstance(audio_data, list): + audio_data = [audio_data] + + # Ultravox processor doesn't support multiple inputs, + # therefore we need to input text and audio one by one + tokenizer = self._get_tokenizer() + audio_features, audio_token_len = [], [] + processed_inputs = {} + for audio, sr in audio_data: + data = self._resample_audio(audio, sr) + processed_inputs = super()._apply_hf_processor( + prompt, data, mm_processor_kwargs) + prompt = tokenizer.decode(processed_inputs["input_ids"][0], + skip_special_tokens=False) + audio_features.append( + processed_inputs.pop("audio_values").squeeze(0)) + audio_token_len.append( + processed_inputs.pop("audio_token_len").item()) + + return dict( + **processed_inputs, + audio_features=audio_features, + audio_token_len=audio_token_len, + ) - if "multi_modal_placeholders" in inputs and "audio" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs + def _get_processor_data( + self, + mm_data: MultiModalDataDict, + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + # Ultravox uses "audio" instead of "audios" as calling keyword + processor_data, passthrough_data = super()._get_processor_data(mm_data) + if "audios" in processor_data: + processor_data["audio"] = processor_data.pop("audios") + return processor_data, passthrough_data + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_inputs: BatchFeature, + mm_processor_kwargs: Mapping[str, object], + ) -> list[PromptReplacement]: + hf_processor = self._get_hf_processor() + placeholder = hf_processor.audio_token_replacement + + def get_replacement_ultravox(item_idx: int): + audio_token_len = hf_inputs["audio_token_len"][item_idx] + return placeholder * audio_token_len + + return [ + PromptReplacement( + modality="audio", + target="<|audio|>", + replacement=get_replacement_ultravox, + ) + ] - feature_extractor = whisper_feature_extractor(ctx) - audios = multi_modal_data["audio"] - if not isinstance(audios, list): - audios = [audios] - - audio_token_counts = [] - for audio in audios: - if isinstance(audio, torch.Tensor): - audio_num_tokens = audio.shape[1] - audio_token_counts.append(audio_num_tokens) - else: - audio_data, sample_rate = audio - audio_length = audio_data.shape[0] - if sample_rate != feature_extractor.sampling_rate: - # Account for resampling. 
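
Under the merged processor the exact placeholder count now comes from the HF processor's audio_token_len output, but the constants above still give a useful back-of-the-envelope estimate: audio is resampled to the Whisper feature extractor's rate and budgeted at roughly _AUDIO_TOKENS_PER_SECOND (6.25) tokens per second. A rough, self-contained sketch of that estimate; the 16 kHz target rate is assumed here rather than read from the feature extractor, and librosa is an optional dependency exactly as in the hunk above:

    import math

    import librosa  # optional dependency, mirroring the try/except above
    import numpy as np

    TARGET_SR = 16000         # assumed WhisperFeatureExtractor sampling rate
    TOKENS_PER_SECOND = 6.25  # _AUDIO_TOKENS_PER_SECOND

    def estimate_audio_tokens(audio: np.ndarray, sr: int) -> int:
        # Resample the way _resample_audio does, then convert the duration
        # into an approximate placeholder-token budget.
        if sr != TARGET_SR:
            audio = librosa.resample(audio.astype(np.float32),
                                     orig_sr=sr,
                                     target_sr=TARGET_SR)
        seconds = len(audio) / TARGET_SR
        return math.ceil(seconds * TOKENS_PER_SECOND)

    # A 5 second clip recorded at 44.1 kHz: roughly ceil(5 * 6.25) = 32 tokens.
    print(estimate_audio_tokens(np.zeros(5 * 44100, dtype=np.float32), 44100))
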
- adjustment = feature_extractor.sampling_rate / sample_rate - audio_length = math.ceil(adjustment * audio_length) - - feature_extractor_output_length = math.ceil( - (audio_length - (feature_extractor.hop_length - 1)) / - feature_extractor.hop_length) - - uv_config = ctx.get_hf_config(UltravoxConfig) - audio_num_tokens = min( - max( - 1, - math.ceil(feature_extractor_output_length / - (uv_config.stack_factor * 2))), - get_ultravox_max_audio_tokens(ctx)) - audio_token_counts.append(audio_num_tokens) - - tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN, - repeat_count=audio_token_counts, - ) - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"audio": ranges}) + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate + audio_len = feature_extractor.chunk_length * sampling_rate + + audio_count = mm_counts["audio"] + audio = np.zeros(audio_len) + data = {"audio": [(audio, sampling_rate)] * audio_count} + + return ProcessorInputs( + prompt_text="<|audio|>" * audio_count, + mm_data=data, + mm_processor_kwargs={}, + ) class StackAudioFrames(nn.Module): @@ -332,11 +298,9 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_ultravox) @MULTIMODAL_REGISTRY.register_max_multimodal_tokens( "audio", get_ultravox_max_audio_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_ultravox) -@INPUT_REGISTRY.register_input_processor(input_processor_for_ultravox) +@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index ce6bec1d49aac..339e193eefe20 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -594,14 +594,10 @@ def _find_placeholders( return list( iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts)) - def _apply_hf_processor( + def _get_processor_data( self, - prompt: str, mm_data: MultiModalDataDict, - mm_processor_kwargs: Mapping[str, object], ) -> BatchFeature: - hf_processor = self._get_hf_processor(**mm_processor_kwargs) - processor_data = dict[str, Any]() passthrough_data = dict[str, Any]() for k, v in mm_data.items(): @@ -619,6 +615,19 @@ def _apply_hf_processor( processor_data[f"{k}s"] = v else: processor_data[k] = v + return processor_data, passthrough_data + + def _apply_hf_processor( + self, + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> BatchFeature: + # some mm_processor_kwargs may be used in processor initialization + # instead of processor call + hf_processor = self._get_hf_processor(**mm_processor_kwargs) + + processor_data, passthrough_data = self._get_processor_data(mm_data) assert callable(hf_processor) mm_processor_kwargs = self.ctx.resolve_hf_processor_call_kwargs( From 2ca830dbaa1a7c30b8ff4d7c860c63f87dc18be3 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 16 Dec 2024 19:23:33 +0800 Subject: [PATCH 43/56] [Doc] Reorder vision 
language examples in alphabet order (#11228) Signed-off-by: Isotr0py <2037008807@qq.com> --- examples/offline_inference_vision_language.py | 486 +++++++++--------- ...e_inference_vision_language_multi_image.py | 288 +++++------ 2 files changed, 387 insertions(+), 387 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 7bc43242b717e..6d0495fdd4054 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -19,6 +19,159 @@ # Unless specified, these settings have been tested to work on a single L4. +# Aria +def run_aria(question: str, modality: str): + assert modality == "image" + model_name = "rhymes-ai/Aria" + + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16", + mm_cache_preprocessor=args.mm_cache_preprocessor) + + prompt = (f"<|im_start|>user\n<|img|>\n{question}" + "<|im_end|>\n<|im_start|>assistant\n") + + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return llm, prompt, stop_token_ids + + +# BLIP-2 +def run_blip2(question: str, modality: str): + assert modality == "image" + + # BLIP-2 prompt format is inaccurate on HuggingFace model repository. + # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa + prompt = f"Question: {question} Answer:" + llm = LLM(model="Salesforce/blip2-opt-2.7b", + mm_cache_preprocessor=args.mm_cache_preprocessor) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# Chameleon +def run_chameleon(question: str, modality: str): + assert modality == "image" + + prompt = f"{question}" + llm = LLM(model="facebook/chameleon-7b", + max_model_len=4096, + mm_cache_preprocessor=args.mm_cache_preprocessor) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# Fuyu +def run_fuyu(question: str, modality: str): + assert modality == "image" + + prompt = f"{question}\n" + llm = LLM(model="adept/fuyu-8b", + max_model_len=2048, + max_num_seqs=2, + mm_cache_preprocessor=args.mm_cache_preprocessor) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# GLM-4v +def run_glm4v(question: str, modality: str): + assert modality == "image" + model_name = "THUDM/glm-4v-9b" + + llm = LLM(model=model_name, + max_model_len=2048, + max_num_seqs=2, + trust_remote_code=True, + enforce_eager=True, + mm_cache_preprocessor=args.mm_cache_preprocessor) + prompt = question + stop_token_ids = [151329, 151336, 151338] + return llm, prompt, stop_token_ids + + +# H2OVL-Mississippi +def run_h2ovl(question: str, modality: str): + assert modality == "image" + + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + mm_cache_preprocessor=args.mm_cache_preprocessor, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [{'role': 'user', 'content': f"\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + return llm, prompt, stop_token_ids + + +# Idefics3-8B-Llama3 +def run_idefics3(question: str, modality: str): + assert modality == "image" + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + # if you are running out of 
memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 3 * 364 + }, + }, + mm_cache_preprocessor=args.mm_cache_preprocessor, + ) + prompt = ( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# InternVL +def run_internvl(question: str, modality: str): + assert modality == "image" + + model_name = "OpenGVLab/InternVL2-2B" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + mm_cache_preprocessor=args.mm_cache_preprocessor, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [{'role': 'user', 'content': f"\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for InternVL + # models variants may have different stop tokens + # please refer to the model card for the correct "stop words": + # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py + stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + return llm, prompt, stop_token_ids + + # LLaVA-1.5 def run_llava(question: str, modality: str): assert modality == "image" @@ -75,83 +228,20 @@ def run_llava_onevision(question: str, modality: str): return llm, prompt, stop_token_ids -# Fuyu -def run_fuyu(question: str, modality: str): - assert modality == "image" - - prompt = f"{question}\n" - llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2, - mm_cache_preprocessor=args.mm_cache_preprocessor) - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# Phi-3-Vision -def run_phi3v(question: str, modality: str): +# Mantis +def run_mantis(question: str, modality: str): assert modality == "image" - prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 + prompt = llama3_template.format(f"{question}\n") - # num_crops is an override kwarg to the multimodal image processor; - # For some models, e.g., Phi-3.5-vision-instruct, it is recommended - # to use 16 for single frame scenarios, and 4 for multi-frame. - # - # Generally speaking, a larger value for num_crops results in more - # tokens per image instance, because it may scale the image more in - # the image preprocessing. Some references in the model docs and the - # formula for image tokens after the preprocessing - # transform can be found below. 
- # - # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally - # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, + model="TIGER-Lab/Mantis-8B-siglip-llama3", max_model_len=4096, - max_num_seqs=2, - # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs={"num_crops": 16}, + hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# PaliGemma -def run_paligemma(question: str, modality: str): - assert modality == "image" - - # PaliGemma has special prompt format for VQA - prompt = "caption en" - llm = LLM(model="google/paligemma-3b-mix-224", - mm_cache_preprocessor=args.mm_cache_preprocessor) - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# PaliGemma 2 -def run_paligemma2(question: str, modality: str): - assert modality == "image" - - # PaliGemma 2 has special prompt format for VQA - prompt = "caption en" - llm = LLM(model="google/paligemma2-3b-ft-docci-448", - mm_cache_preprocessor=args.mm_cache_preprocessor) - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# Chameleon -def run_chameleon(question: str, modality: str): - assert modality == "image" - - prompt = f"{question}" - llm = LLM(model="facebook/chameleon-7b", - max_model_len=4096, - mm_cache_preprocessor=args.mm_cache_preprocessor) - stop_token_ids = None + stop_token_ids = [128009] return llm, prompt, stop_token_ids @@ -199,58 +289,45 @@ def run_minicpmv(question: str, modality: str): return llm, prompt, stop_token_ids -# H2OVL-Mississippi -def run_h2ovl(question: str, modality: str): +# LLama 3.2 +def run_mllama(question: str, modality: str): assert modality == "image" - model_name = "h2oai/h2ovl-mississippi-2b" + model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" + # Note: The default setting of max_num_seqs (256) and + # max_model_len (131072) for this model may cause OOM. + # You may lower either to run this example on lower-end GPUs. + + # The configuration below has been confirmed to launch on a single L40 GPU. 
llm = LLM( model=model_name, - trust_remote_code=True, - max_model_len=8192, + max_model_len=4096, + max_num_seqs=16, + enforce_eager=True, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [{'role': 'user', 'content': f"\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) - - # Stop tokens for H2OVL-Mississippi - # https://huggingface.co/h2oai/h2ovl-mississippi-2b - stop_token_ids = [tokenizer.eos_token_id] + prompt = f"<|image|><|begin_of_text|>{question}" + stop_token_ids = None return llm, prompt, stop_token_ids -# InternVL -def run_internvl(question: str, modality: str): +# Molmo +def run_molmo(question, modality): assert modality == "image" - model_name = "OpenGVLab/InternVL2-2B" + model_name = "allenai/Molmo-7B-D-0924" llm = LLM( model=model_name, trust_remote_code=True, - max_model_len=4096, + dtype="bfloat16", mm_cache_preprocessor=args.mm_cache_preprocessor, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [{'role': 'user', 'content': f"\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) - - # Stop tokens for InternVL - # models variants may have different stop tokens - # please refer to the model card for the correct "stop words": - # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py - stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + prompt = question + stop_token_ids = None return llm, prompt, stop_token_ids @@ -279,58 +356,57 @@ def run_nvlm_d(question: str, modality: str): return llm, prompt, stop_token_ids -# BLIP-2 -def run_blip2(question: str, modality: str): +# PaliGemma +def run_paligemma(question: str, modality: str): assert modality == "image" - # BLIP-2 prompt format is inaccurate on HuggingFace model repository. - # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa - prompt = f"Question: {question} Answer:" - llm = LLM(model="Salesforce/blip2-opt-2.7b", + # PaliGemma has special prompt format for VQA + prompt = "caption en" + llm = LLM(model="google/paligemma-3b-mix-224", mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids -# Qwen -def run_qwen_vl(question: str, modality: str): +# PaliGemma 2 +def run_paligemma2(question: str, modality: str): assert modality == "image" - llm = LLM( - model="Qwen/Qwen-VL", - trust_remote_code=True, - max_model_len=1024, - max_num_seqs=2, - mm_cache_preprocessor=args.mm_cache_preprocessor, - ) - - prompt = f"{question}Picture 1: \n" + # PaliGemma 2 has special prompt format for VQA + prompt = "caption en" + llm = LLM(model="google/paligemma2-3b-ft-docci-448", + mm_cache_preprocessor=args.mm_cache_preprocessor) stop_token_ids = None return llm, prompt, stop_token_ids -# Qwen2-VL -def run_qwen2_vl(question: str, modality: str): +# Phi-3-Vision +def run_phi3v(question: str, modality: str): assert modality == "image" - model_name = "Qwen/Qwen2-VL-7B-Instruct" + prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" + # num_crops is an override kwarg to the multimodal image processor; + # For some models, e.g., Phi-3.5-vision-instruct, it is recommended + # to use 16 for single frame scenarios, and 4 for multi-frame. 
+ # + # Generally speaking, a larger value for num_crops results in more + # tokens per image instance, because it may scale the image more in + # the image preprocessing. Some references in the model docs and the + # formula for image tokens after the preprocessing + # transform can be found below. + # + # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally + # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 llm = LLM( - model=model_name, + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, max_model_len=4096, - max_num_seqs=5, + max_num_seqs=2, # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs={ - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - }, + mm_processor_kwargs={"num_crops": 16}, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - - prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") stop_token_ids = None return llm, prompt, stop_token_ids @@ -352,149 +428,73 @@ def run_pixtral_hf(question: str, modality: str): return llm, prompt, stop_token_ids -# LLama 3.2 -def run_mllama(question: str, modality: str): - assert modality == "image" - - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - - # Note: The default setting of max_num_seqs (256) and - # max_model_len (131072) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. - - # The configuration below has been confirmed to launch on a single L40 GPU. - llm = LLM( - model=model_name, - max_model_len=4096, - max_num_seqs=16, - enforce_eager=True, - mm_cache_preprocessor=args.mm_cache_preprocessor, - ) - - prompt = f"<|image|><|begin_of_text|>{question}" - stop_token_ids = None - return llm, prompt, stop_token_ids - - -# Molmo -def run_molmo(question, modality): +# Qwen +def run_qwen_vl(question: str, modality: str): assert modality == "image" - model_name = "allenai/Molmo-7B-D-0924" - llm = LLM( - model=model_name, + model="Qwen/Qwen-VL", trust_remote_code=True, - dtype="bfloat16", + max_model_len=1024, + max_num_seqs=2, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - prompt = question + prompt = f"{question}Picture 1: \n" stop_token_ids = None return llm, prompt, stop_token_ids -# GLM-4v -def run_glm4v(question: str, modality: str): +# Qwen2-VL +def run_qwen2_vl(question: str, modality: str): assert modality == "image" - model_name = "THUDM/glm-4v-9b" - llm = LLM(model=model_name, - max_model_len=2048, - max_num_seqs=2, - trust_remote_code=True, - enforce_eager=True, - mm_cache_preprocessor=args.mm_cache_preprocessor) - prompt = question - stop_token_ids = [151329, 151336, 151338] - return llm, prompt, stop_token_ids - - -# Idefics3-8B-Llama3 -def run_idefics3(question: str, modality: str): - assert modality == "image" - model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + model_name = "Qwen/Qwen2-VL-7B-Instruct" llm = LLM( model=model_name, - max_model_len=8192, - max_num_seqs=2, - enforce_eager=True, - # if you are running out of memory, you can reduce the "longest_edge". 
- # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + max_model_len=4096, + max_num_seqs=5, + # Note - mm_processor_kwargs can also be passed to generate/chat calls mm_processor_kwargs={ - "size": { - "longest_edge": 3 * 364 - }, + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, }, mm_cache_preprocessor=args.mm_cache_preprocessor, ) - prompt = ( - f"<|begin_of_text|>User:{question}\nAssistant:" - ) - stop_token_ids = None - return llm, prompt, stop_token_ids - -# Aria -def run_aria(question: str, modality: str): - assert modality == "image" - model_name = "rhymes-ai/Aria" - - llm = LLM(model=model_name, - tokenizer_mode="slow", - trust_remote_code=True, - dtype="bfloat16", - mm_cache_preprocessor=args.mm_cache_preprocessor) - - prompt = (f"<|im_start|>user\n<|img|>\n{question}" - "<|im_end|>\n<|im_start|>assistant\n") - - stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] - return llm, prompt, stop_token_ids - - -# Mantis -def run_mantis(question: str, modality: str): - assert modality == "image" - - llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 - prompt = llama3_template.format(f"{question}\n") - - llm = LLM( - model="TIGER-Lab/Mantis-8B-siglip-llama3", - max_model_len=4096, - hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, - mm_cache_preprocessor=args.mm_cache_preprocessor, - ) - stop_token_ids = [128009] + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None return llm, prompt, stop_token_ids model_example_map = { + "aria": run_aria, + "blip-2": run_blip2, + "chameleon": run_chameleon, + "fuyu": run_fuyu, + "glm4v": run_glm4v, + "h2ovl_chat": run_h2ovl, + "idefics3": run_idefics3, + "internvl_chat": run_internvl, "llava": run_llava, "llava-next": run_llava_next, "llava-next-video": run_llava_next_video, "llava-onevision": run_llava_onevision, - "fuyu": run_fuyu, - "phi3_v": run_phi3v, - "paligemma": run_paligemma, - "paligemma2": run_paligemma2, - "chameleon": run_chameleon, + "mantis": run_mantis, "minicpmv": run_minicpmv, - "blip-2": run_blip2, - "h2ovl_chat": run_h2ovl, - "internvl_chat": run_internvl, + "mllama": run_mllama, + "molmo": run_molmo, "NVLM_D": run_nvlm_d, + "paligemma": run_paligemma, + "paligemma2": run_paligemma2, + "phi3_v": run_phi3v, + "pixtral_hf": run_pixtral_hf, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, - "pixtral_hf": run_pixtral_hf, - "mllama": run_mllama, - "molmo": run_molmo, - "glm4v": run_glm4v, - "idefics3": run_idefics3, - "aria": run_aria, - "mantis": run_mantis, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 928bbef54eab7..6af8d7768e75d 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -33,78 +33,23 @@ class ModelRequestData(NamedTuple): # Unless specified, these settings have been tested to work on a single L4. 
-def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: - model_name = "Qwen/Qwen-VL-Chat" - llm = LLM( - model=model_name, - trust_remote_code=True, - max_model_len=1024, - max_num_seqs=2, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - placeholders = "".join(f"Picture {i}: \n" - for i, _ in enumerate(image_urls, start=1)) - - # This model does not have a chat_template attribute on its tokenizer, - # so we need to explicitly pass it. We use ChatML since it's used in the - # generation utils of the model: - # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265 - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - - # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating - chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501 - - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True, - chat_template=chat_template) - - stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - return ModelRequestData( - llm=llm, - prompt=prompt, - stop_token_ids=stop_token_ids, - image_data=[fetch_image(url) for url in image_urls], - chat_template=chat_template, - ) - - -def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: - # num_crops is an override kwarg to the multimodal image processor; - # For some models, e.g., Phi-3.5-vision-instruct, it is recommended - # to use 16 for single frame scenarios, and 4 for multi-frame. - # - # Generally speaking, a larger value for num_crops results in more - # tokens per image instance, because it may scale the image more in - # the image preprocessing. Some references in the model docs and the - # formula for image tokens after the preprocessing - # transform can be found below. 
- # - # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally - # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 - llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=2, - limit_mm_per_prompt={"image": len(image_urls)}, - mm_processor_kwargs={"num_crops": 4}, - ) - placeholders = "\n".join(f"<|image_{i}|>" - for i, _ in enumerate(image_urls, start=1)) - prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" - stop_token_ids = None - +def load_aria(question, image_urls: List[str]) -> ModelRequestData: + model_name = "rhymes-ai/Aria" + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16", + limit_mm_per_prompt={"image": len(image_urls)}) + placeholders = "<|img|>\n" * len(image_urls) + prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] return ModelRequestData( llm=llm, prompt=prompt, stop_token_ids=stop_token_ids, image_data=[fetch_image(url) for url in image_urls], - chat_template=None, - ) + chat_template=None) def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: @@ -141,6 +86,37 @@ def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: ) +def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + # The configuration below has been confirmed to launch on a single L40 GPU. + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + # if you are running out of memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 2 * 364 + }, + }, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -178,6 +154,28 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: ) +def load_mllama(question, image_urls: List[str]) -> ModelRequestData: + model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" + + # The configuration below has been confirmed to launch on a single L40 GPU. 
+ llm = LLM( + model=model_name, + max_model_len=4096, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + prompt = f"<|image|><|image|><|begin_of_text|>{question}" + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_nvlm_d(question: str, image_urls: List[str]): model_name = "nvidia/NVLM-D-72B" @@ -211,6 +209,80 @@ def load_nvlm_d(question: str, image_urls: List[str]): ) +def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: + # num_crops is an override kwarg to the multimodal image processor; + # For some models, e.g., Phi-3.5-vision-instruct, it is recommended + # to use 16 for single frame scenarios, and 4 for multi-frame. + # + # Generally speaking, a larger value for num_crops results in more + # tokens per image instance, because it may scale the image more in + # the image preprocessing. Some references in the model docs and the + # formula for image tokens after the preprocessing + # transform can be found below. + # + # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally + # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=2, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={"num_crops": 4}, + ) + placeholders = "\n".join(f"<|image_{i}|>" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" + stop_token_ids = None + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + +def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: + model_name = "Qwen/Qwen-VL-Chat" + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=1024, + max_num_seqs=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + placeholders = "".join(f"Picture {i}: \n" + for i, _ in enumerate(image_urls, start=1)) + + # This model does not have a chat_template attribute on its tokenizer, + # so we need to explicitly pass it. 
We use ChatML since it's used in the + # generation utils of the model: + # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265 + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + + # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating + chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501 + + messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True, + chat_template=chat_template) + + stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=chat_template, + ) + + def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info @@ -268,88 +340,16 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ) -def load_mllama(question, image_urls: List[str]) -> ModelRequestData: - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - - # The configuration below has been confirmed to launch on a single L40 GPU. - llm = LLM( - model=model_name, - max_model_len=4096, - max_num_seqs=16, - enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - prompt = f"<|image|><|image|><|begin_of_text|>{question}" - return ModelRequestData( - llm=llm, - prompt=prompt, - stop_token_ids=None, - image_data=[fetch_image(url) for url in image_urls], - chat_template=None, - ) - - -def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: - model_name = "HuggingFaceM4/Idefics3-8B-Llama3" - - # The configuration below has been confirmed to launch on a single L40 GPU. - llm = LLM( - model=model_name, - max_model_len=8192, - max_num_seqs=16, - enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, - # if you are running out of memory, you can reduce the "longest_edge". 
- # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations - mm_processor_kwargs={ - "size": { - "longest_edge": 2 * 364 - }, - }, - ) - - placeholders = "\n".join(f"Image-{i}: \n" - for i, _ in enumerate(image_urls, start=1)) - prompt = f"<|begin_of_text|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 - return ModelRequestData( - llm=llm, - prompt=prompt, - stop_token_ids=None, - image_data=[fetch_image(url) for url in image_urls], - chat_template=None, - ) - - -def load_aria(question, image_urls: List[str]) -> ModelRequestData: - model_name = "rhymes-ai/Aria" - llm = LLM(model=model_name, - tokenizer_mode="slow", - trust_remote_code=True, - dtype="bfloat16", - limit_mm_per_prompt={"image": len(image_urls)}) - placeholders = "<|img|>\n" * len(image_urls) - prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" - "<|im_start|>assistant\n") - stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] - return ModelRequestData( - llm=llm, - prompt=prompt, - stop_token_ids=stop_token_ids, - image_data=[fetch_image(url) for url in image_urls], - chat_template=None) - - model_example_map = { - "phi3_v": load_phi3v, + "aria": load_aria, "h2ovl_chat": load_h2onvl, + "idefics3": load_idefics3, "internvl_chat": load_internvl, + "mllama": load_mllama, "NVLM_D": load_nvlm_d, - "qwen2_vl": load_qwen2_vl, + "phi3_v": load_phi3v, "qwen_vl_chat": load_qwenvl_chat, - "mllama": load_mllama, - "idefics3": load_idefics3, - "aria": load_aria, + "qwen2_vl": load_qwen2_vl, } From efbce85f4d375d7851a491a0126a224e25d9f91d Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 16 Dec 2024 13:14:57 -0500 Subject: [PATCH 44/56] [misc] Layerwise profile updates (#10242) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .buildkite/test-pipeline.yaml | 2 +- examples/offline_profile.py | 236 +++++++++++++++--- tools/profiler/print_layerwise_table.py | 9 +- tools/profiler/visualize_layerwise_profile.py | 92 ++++++- vllm/profiler/layerwise_profile.py | 22 +- 5 files changed, 314 insertions(+), 47 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 97aae233db105..44f47fac1c1b3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -201,7 +201,7 @@ steps: - python3 offline_inference_classification.py - python3 offline_inference_embedding.py - python3 offline_inference_scoring.py - - python3 offline_profile.py --model facebook/opt-125m + - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] diff --git a/examples/offline_profile.py b/examples/offline_profile.py index 1d415b82cddb6..46afe8aa2604b 100644 --- a/examples/offline_profile.py +++ b/examples/offline_profile.py @@ -4,9 +4,10 @@ import sys from argparse import RawTextHelpFormatter from dataclasses import asdict, dataclass -from typing import Optional +from typing import Any, Dict, Generator, List, Optional, TypeAlias import torch +import tqdm from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs @@ -15,16 +16,21 @@ BATCH_SIZE_DEFAULT = 1 PROMPT_LEN_DEFAULT = 256 -OUTPUT_LEN_DEFAULT = 2 @dataclass class ProfileContext: engine_args: EngineArgs prompt_len: int - output_len: int batch_size: int - save_chrome_traces_folder: Optional[str] + + # The profiler can run in 2 modes, + # 1. Run profiler for user specified num_steps + num_steps: Optional[int] = None + # 2. 
Run profiler until all requests complete + complete_num_requests_per_step: Optional[int] = None + + save_chrome_traces_folder: Optional[str] = None def get_dtype(dtype: str): @@ -34,23 +40,155 @@ def get_dtype(dtype: str): return dtype +OutputLen_NumReqs_Map: TypeAlias = Dict[int, int] +def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \ + -> OutputLen_NumReqs_Map: + """ + Given the number of requests, batch_size, and the number of requests + that each engine-step should process, step_requests, determine the + output lengths of the requests such that step_request is honoured. + + Example: + if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1] + then return, + {2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning, + 32 requests should have output length 2, + 32 requests should have output length 3, + 32 requests should have output length 4, + 31 requests should have output length 5, + 1 request should have output length 6. + + Args: + batch_size (int): Number of requests submitted for profile. This is + args.batch_size. + step_requests (List[int]): step_requests[i] is the number of requests + that the ith engine step should process. + + Returns: + OutputLen_NumReqs_Map : A dictionary with output-length as keys and the + number of requests required to have that output-length as values. + """ + ol_nr: OutputLen_NumReqs_Map = {} + + # Number of request that are assigned an output-length + num_reqs_assigned: int = 0 + num_steps: int = len(step_requests) + + # sanity check. The first step (prefill-step), must process all requests. + assert step_requests[0] == batch_size + + # Begin assignments from the last step. + output_length: int = num_steps + for num_requests_at_step in reversed(step_requests): + if num_reqs_assigned == batch_size: + break + + assert num_reqs_assigned < batch_size + + # Remove the number of requests that have been determined + # to participate in this step and beyond. + num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned + assert num_reqs_unassigned_at_step >= 0 + + if num_reqs_unassigned_at_step > 0: + ol_nr[output_length] = num_reqs_unassigned_at_step + num_reqs_assigned += num_reqs_unassigned_at_step + + output_length -= 1 + + # sanity checks. + assert sum(ol_nr.values()) == batch_size, \ + ("Number of requests in output-length assignment does not match " + f"batch-size.\n batch size {batch_size} - " + f"step requests {step_requests} - assignments {ol_nr}") + + # Check that the output-length is in [1, num-steps]. Output length must be + # at least 1 as all requests must participate in the prefill-step. + assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), \ + ("Output lengths of requests should be in range " + f"[1, num-engine-steps].\n batch size {batch_size} - " + f"step requests {step_requests} - assignments {ol_nr}") + + return ol_nr + + +def determine_requests_per_step(context: ProfileContext) -> List[int]: + """ + Determine number of requests each engine step should process. + If context.num_steps is set, then all engine steps process the + same number of requests and the output list is of length + context.num_steps. + + If context.complete_num_requests_per_step is set, then each decode step + processes fewer and fewer requests until there are no requests to process. + In this case, the output list is as big as the number of steps + required to process all requests. + + Args: + context: ProfileContext object. + + Returns: + List[int]: Number of requests to process for all engine-steps. 
+ output[i], contains the number of requests that the ith step + should process. + """ + if context.num_steps: + # All requests must run until num_engine_steps. This implies + # that their output lengths must be equal to num_engine_steps. + return [context.batch_size] * context.num_steps + + assert context.complete_num_requests_per_step and \ + context.complete_num_requests_per_step > 0, \ + (f"Expected a positive complete_num_requests_per_step argument." + f"Instead got {context.complete_num_requests_per_step}") + + # We start dropping after the first decode step. + step_requests = [ + context.batch_size, # prefill + context.batch_size, # decode + ] + + num_running_requests = context.batch_size + num_running_requests -= context.complete_num_requests_per_step + while num_running_requests > 0: + step_requests.append(num_running_requests) + num_running_requests -= context.complete_num_requests_per_step + + if step_requests[-1] != 1: + # have 1 request running at the last step. This is often + # useful + step_requests.append(1) + + return step_requests + + def run_profile(context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]): print("Run profile with:") for key, value in asdict(context).items(): print(f" {key} = {value}") + requests_per_step: List[int] = determine_requests_per_step(context) + + ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths( + context.batch_size, requests_per_step) + + num_steps_to_profile: int = len(requests_per_step) + max_output_len: int = max(ol_nr.keys()) + assert max_output_len >= 1 + # Create sampling params - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=args.output_len, - ignore_eos=True) + sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + # max_tokens is set on a per-request basis. 
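
Both helpers above can be exercised without importing vLLM. The stripped-down sketch below first rebuilds the step schedule that determine_requests_per_step emits for batch_size=128 with complete_num_requests_per_step=32, then replays the backward walk from compute_request_output_lengths to recover the output-length assignment quoted in its docstring:

    # Schedule: prefill, first decode, then drop 32 requests per decode step,
    # with a final single-request step tacked on.
    batch_size, drop_per_step = 128, 32
    step_requests = [batch_size, batch_size]
    running = batch_size - drop_per_step
    while running > 0:
        step_requests.append(running)
        running -= drop_per_step
    if step_requests[-1] != 1:
        step_requests.append(1)
    print(step_requests)            # [128, 128, 96, 64, 32, 1]

    # Backward walk: assign the longest output lengths first.
    ol_nr, num_assigned = {}, 0
    output_length = len(step_requests)
    for reqs_at_step in reversed(step_requests):
        if num_assigned == batch_size:
            break
        newly_assigned = reqs_at_step - num_assigned
        if newly_assigned > 0:
            ol_nr[output_length] = newly_assigned
            num_assigned += newly_assigned
        output_length -= 1
    print(ol_nr)                    # {6: 1, 5: 31, 4: 32, 3: 32, 2: 32}
    assert sum(ol_nr.values()) == batch_size
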
+ max_tokens=None, + ignore_eos=True) # Create LLM llm = LLM(**asdict(context.engine_args)) batch_size = context.batch_size prompt_len = context.prompt_len - output_len = context.output_len scheduler_config = llm.llm_engine.scheduler_config max_model_len = llm.llm_engine.model_config.max_model_len @@ -65,7 +203,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], f"choose a smaller batch size or prompt length, or increase " f"--max-num-batched-tokens") sys.exit(-1) - if batch_size >= max_num_seqs: + if batch_size > max_num_seqs: print( f"ERROR: chosen batch_size ({batch_size}) is larger than " f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a " @@ -73,16 +211,26 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], sys.exit(-1) print("llm.llm_engine.model_config.max_model_len: ", llm.llm_engine.model_config.max_model_len) - if prompt_len + output_len > llm.llm_engine.model_config.max_model_len: - print( - f"ERROR: chosen prompt_len + output_len ({prompt_len} + " - f"{output_len} = {prompt_len + output_len}) is larger than the " - f"model's max_model_len ({max_model_len}), please choose a smaller " - f"prompt_len or output_len, or increase --max-model-len") + if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len: + print(f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + " + f"{max_output_len} = {prompt_len + max_output_len}) is larger " + f"than the model's max_model_len ({max_model_len}), please " + f"choose a smaller prompt_len or max_output_len, or increase " + f"--max-model-len") sys.exit(-1) def add_requests(): + + def get_output_len_generator() -> Generator[int, Any, Any]: + for output_len, num_reqs in ol_nr.items(): + for _ in range(num_reqs): + yield output_len + + output_len_generator = get_output_len_generator() for i in range(batch_size): + sampling_params.max_tokens = next(output_len_generator) + assert isinstance(sampling_params.max_tokens, int) + prompt_token_ids = torch.randint( llm.llm_engine.model_config.get_vocab_size(), size=(prompt_len, )).tolist() @@ -110,8 +258,11 @@ def abort_requests(): llm.llm_engine.step() # First step is prefill decode_profs = [] - for x in range(args.output_len - 1): - with layerwise_profile() as decode_prof: + for _ in tqdm.tqdm(range(num_steps_to_profile - 1)): + num_running_seqs = llm.llm_engine.scheduler[ + 0].get_num_unfinished_seq_groups() + with layerwise_profile( + num_running_seqs=num_running_seqs) as decode_prof: llm.llm_engine.step() decode_profs.append(decode_prof) @@ -154,7 +305,8 @@ def abort_requests(): decode_results_list[0].print_summary_table() if csv_output: - csv_filename_base = csv_output.rstrip(".csv") + csv_filename_base = csv_output[:-4] \ + if csv_output.endswith('.csv') else csv_output prefill_results.export_model_stats_table_csv( csv_filename_base + "_prefill_model_table.csv") prefill_results.export_summary_stats_table_csv( @@ -187,10 +339,10 @@ def abort_requests(): for idx, dr in enumerate(decode_results_list): json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() - for idx, dr in enumerate(decode_results_list[1:]): - json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() - - with open(json_output.rstrip(".json") + ".json", "w+") as f: + # Add .json to json_output filename if it doesn't exist already. 
+ json_output_file = json_output if json_output.endswith( + '.json') else json_output + '.json' + with open(json_output_file, "w+") as f: json.dump(json_dict, f, indent=2) pass @@ -214,7 +366,7 @@ def abort_requests(): python examples/offline_profile.py \\ --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ - --enforce-eager + --enforce-eager run_num_steps -n 2 ``` then you can use various tools to analyze the json output @@ -261,17 +413,41 @@ def abort_requests(): default=BATCH_SIZE_DEFAULT, help=f"Number of requests to run as a single batch, " f"default={BATCH_SIZE_DEFAULT}") - parser.add_argument( - "--output-len", + + subparsers = parser.add_subparsers(dest="cmd") + + run_num_steps_parser = subparsers.add_parser( + "run_num_steps", + help="This variation profiles n engine.step() invocations.") + run_num_steps_parser.add_argument( + '-n', + '--num-steps', type=int, - default=OUTPUT_LEN_DEFAULT, - help="Number of llm steps to run (includes prefill and decode) " - "- default={OUTPUT_LEN_DEFAULT}") + help="Number of engine steps to profile.\n" + "Setting it to 1, profiles only the prefill step.\n" + "Setting it to 2, profiles the prefill and first decode step\n" + "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n" + "and so on ...") + + run_to_completion_parser = subparsers.add_parser( + "run_to_completion", + help="This variation profiles all the engine.step() invocations" + "until the engine exhausts all submitted requests.") + run_to_completion_parser.add_argument( + '-n', + '--complete-num-requests-per-step', + type=int, + help= + "Complete complete_num_requests_per_step requests every decode step." + "For e.g., with batch_size 128 and complete_num_requests_per_step 32," + "the profiler is run for 6 engine steps, with the steps processing, " + "128, 128, 96, 64, 32, 1 requests respectively.\n" + "Note that we tack-on a one-request step at the end as it is often " + "useful.") EngineArgs.add_cli_args(parser) args = parser.parse_args() - context = ProfileContext( engine_args=EngineArgs.from_cli_args(args), **{ diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 081076ad7dbdc..394ca8663e189 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -34,9 +34,10 @@ def get_entries(node, curr_depth=0): "examples/offline_profile.py") parser.add_argument("--phase", type=str, - choices=["prefill", "decode_1"], required=True, - help="The phase to print the table for.") + help="The phase to print the table for. This is either" + "prefill or decode_n, where n is the decode step " + "number") parser.add_argument("--table", type=str, choices=["summary", "model"], @@ -49,6 +50,10 @@ def get_entries(node, curr_depth=0): with open(args.json_trace) as f: profile_data = json.load(f) + assert args.phase in profile_data, \ + (f"Cannot find phase {args.phase} in profile data. 
Choose one among" + f'{[x for x in profile_data.keys() if "prefill" in x or "decode" in x]}') #noqa + if args.table == "summary": entries_and_depths = flatten_entries( SummaryStatsEntry, profile_data[args.phase]["summary_stats"]) diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index adc44474aa4c1..da7a28da15c19 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -151,16 +151,31 @@ def is_quant(op_name: str): "scaled_int8_quant" in op_name: return True + # LoRA ops + def is_sgmv_shrink(op_name: str): + return "sgmv_shrink" in op_name + + def is_sgmv_expand(op_name: str): + return "sgmv_expand" in op_name + + def is_bgmv_shrink(op_name: str): + return "bgmv_shrink" in op_name + + def is_bgmv_expand(op_name: str): + return "bgmv_expand" in op_name + + def is_cutlass_gemm_op(op_name: str): + return "void cutlass::Kernel" in op_name or \ + "void cutlass::device_kernel" in op_name + def is_gemm_op(op_name: str): if is_quant(op_name): return False - if "xmma_gemm" in op_name or \ + return is_cutlass_gemm_op(op_name) or \ + "xmma_gemm" in op_name or \ "gemv2T_kernel" in op_name or \ "splitKreduce" in op_name or \ - "void cutlass::Kernel" in op_name or \ - "void cutlass::device_kernel" in op_name or \ - "s16816gemm" in op_name: - return True + "s16816gemm" in op_name def is_elementwise_op(op_name: str): return "elementwise_kernel" in op_name @@ -211,6 +226,18 @@ def is_reduce_kernel(op_name: str): quant_ops = list(filter(lambda x: is_quant(x), ops)) ops = list(filter(lambda x: x not in quant_ops, ops)) + sgmv_shrink_ops = list(filter(lambda x: is_sgmv_shrink(x), ops)) + ops = list(filter(lambda x: x not in sgmv_shrink_ops, ops)) + sgmv_expand_ops = list(filter(lambda x: is_sgmv_expand(x), ops)) + ops = list(filter(lambda x: x not in sgmv_expand_ops, ops)) + bgmv_shrink_ops = list(filter(lambda x: is_bgmv_shrink(x), ops)) + ops = list(filter(lambda x: x not in bgmv_shrink_ops, ops)) + bgmv_expand_ops = list(filter(lambda x: is_bgmv_expand(x), ops)) + ops = list(filter(lambda x: x not in bgmv_expand_ops, ops)) + + cutlass_gemm_ops = list(filter(lambda x: is_cutlass_gemm_op(x), ops)) + ops = list(filter(lambda x: x not in cutlass_gemm_ops, ops)) + gemm_ops = list(filter(lambda x: is_gemm_op(x), ops)) ops = list(filter(lambda x: x not in gemm_ops, ops)) @@ -257,6 +284,24 @@ def is_reduce_kernel(op_name: str): trace_df['attention'] = trace_df[attention_ops].agg("sum", axis=1) if len(quant_ops): trace_df['quant_ops'] = trace_df[quant_ops].agg("sum", axis=1) + + if len(sgmv_shrink_ops): + trace_df['sgmv_shrink_ops'] = trace_df[sgmv_shrink_ops].agg("sum", + axis=1) + if len(sgmv_expand_ops): + trace_df['sgmv_expand_ops'] = trace_df[sgmv_expand_ops].agg("sum", + axis=1) + if len(bgmv_shrink_ops): + trace_df['bgmv_shrink_ops'] = trace_df[bgmv_shrink_ops].agg("sum", + axis=1) + if len(bgmv_expand_ops): + trace_df['bgmv_expand_ops'] = trace_df[bgmv_expand_ops].agg("sum", + axis=1) + + if len(cutlass_gemm_ops): + trace_df['cutlass_gemm_ops'] = trace_df[cutlass_gemm_ops].agg("sum", + axis=1) + if len(gemm_ops): trace_df['gemm_ops'] = trace_df[gemm_ops].agg("sum", axis=1) if len(rms_norm_ops): @@ -296,7 +341,9 @@ def is_reduce_kernel(op_name: str): trace_df['reduce_kernel_ops'] = trace_df[reduce_kernel_ops].agg("sum", axis=1) - trace_df.drop(attention_ops + quant_ops + gemm_ops + rms_norm_ops + + trace_df.drop(attention_ops + quant_ops + sgmv_shrink_ops + + sgmv_expand_ops + bgmv_shrink_ops + 
bgmv_expand_ops + + cutlass_gemm_ops + gemm_ops + rms_norm_ops + vocab_embed_ops + mem_ops + elementwise_ops + nccl_all_reduce_ops + nccl_gather_ops + nccl_broadcast_ops + nccl_other_ops + cross_device_reduce_1stage_ops + @@ -315,7 +362,14 @@ def plot_trace_df(traces_df: pd.DataFrame, plot_title: str, output: Optional[Path] = None): + def get_phase_description(traces_df: pd.DataFrame, phase: str) -> str: + phase_df = traces_df.query(f'phase == "{phase}"') + descs = phase_df['phase_desc'].to_list() + assert all([desc == descs[0] for desc in descs]) + return descs[0] + phases = traces_df['phase'].unique() + phase_descs = [get_phase_description(traces_df, p) for p in phases] traces_df = traces_df.pivot_table(index="phase", columns="name", values=plot_metric, @@ -324,7 +378,8 @@ def plot_trace_df(traces_df: pd.DataFrame, traces_df = group_trace_by_operations(traces_df) # Make the figure - fig, ax = plt.subplots(1, figsize=(5, 8), sharex=True) + fig_size_x = max(5, len(phases)) + fig, ax = plt.subplots(1, figsize=(fig_size_x, 8), sharex=True) # Draw the stacked bars ops = list(traces_df) @@ -332,7 +387,7 @@ def plot_trace_df(traces_df: pd.DataFrame, for op in ops: values = [traces_df[op][phase] for phase in phases] values = list(map(lambda x: 0.0 if math.isnan(x) else x, values)) - ax.bar(phases, values, label=op, bottom=bottom) + ax.bar(phase_descs, values, label=op, bottom=bottom) bottom = [bottom[j] + values[j] for j in range(len(phases))] # Write the values as text on the bars @@ -390,6 +445,14 @@ def keep_only_top_entries(df: pd.DataFrame, ["name"]] = "others" return df + def get_phase_description(key: str) -> str: + num_running_seqs = profile_json[key]['metadata'][ + 'num_running_seqs'] + if num_running_seqs is not None: + return f"{key}-seqs-{num_running_seqs}" + else: + return key + # Get data for each key traces = list(map(lambda x: get_entries_and_traces(x), step_keys)) @@ -413,6 +476,7 @@ def keep_only_top_entries(df: pd.DataFrame, # Fill in information about the step-keys for trace_df, step_key in zip(trace_dfs, step_keys): trace_df['phase'] = step_key + trace_df['phase_desc'] = get_phase_description(step_key) # Combine all data frames so they can be put in a single plot traces_df = pd.concat(trace_dfs) @@ -426,12 +490,16 @@ def keep_only_top_entries(df: pd.DataFrame, def make_plot_title_suffix(profile_json: dict) -> str: context = profile_json["context"] sparsity = context.get('sparsity', None) - return (f"{context['model']}\n" + run_type = \ + f'Run {context["num_steps"]} steps' if context['num_steps'] else \ + (f'Complete {context["complete_num_requests_per_step"]} per ' + f'step; Run till completion') + return (f"{context['engine_args']['model']}\n" f"Batch={context['batch_size']}, " f"PromptLen={context['prompt_len']}, " - f"OutputLen={context['output_len']}," - f"NumGpus={context['tensor_parallel_size']}" - f"{', Sparsity ' + sparsity if sparsity else ''}") + f"NumGpus={context['engine_args']['tensor_parallel_size']}" + f"{', Sparsity ' + sparsity if sparsity else ''}\n" + f"Run Type: {run_type}") profile_json = None with open(json_trace) as f: diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 9d9f427e807f6..33babfebdca1e 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -72,6 +72,9 @@ class LayerwiseProfileResults(profile): _model_stats_tree: List[_StatsTreeNode] = field(init=False) _summary_stats_tree: List[_StatsTreeNode] = field(init=False) + # profile metadata + num_running_seqs: 
Optional[int] = None + def __post_init__(self): self._build_correlation_map() self._build_module_tree() @@ -127,6 +130,9 @@ def export_summary_stats_table_csv(self, filename: str): def convert_stats_to_dict(self) -> str: return { + "metadata": { + "num_running_seqs": self.num_running_seqs + }, "summary_stats": self._convert_stats_tree_to_dict(self._summary_stats_tree), "model_stats": @@ -338,7 +344,15 @@ def df_traversal(node: _StatsTreeNode, curr_json_list: List[Dict]): class layerwise_profile(profile): - def __init__(self): + def __init__(self, num_running_seqs: Optional[int] = None): + """ + layerwise profile constructor. + + Args: + num_running_seqs (Optional[int], optional): When given, + num_running_seqs will be passed to LayerProfileResults for metadata + update. Defaults to None. + """ super().__init__( activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, @@ -346,9 +360,13 @@ def __init__(self): with_modules=True, experimental_config=_ExperimentalConfig(verbose=True)) + self.num_running_seqs = num_running_seqs + def __enter__(self): return super().__enter__() def __exit__(self, exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) - self.results = LayerwiseProfileResults(self.profiler.kineto_results) + self.results = LayerwiseProfileResults( + self.profiler.kineto_results, + num_running_seqs=self.num_running_seqs) From 551603feffd9b4ba98ccdd34e02e403e04db88c1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 16 Dec 2024 13:32:25 -0800 Subject: [PATCH 45/56] [core] overhaul memory profiling and fix backward compatibility (#10511) Signed-off-by: youkaichao --- tests/entrypoints/llm/test_gpu_utilization.py | 25 ++++ tests/entrypoints/llm/test_lazy_outlines.py | 2 +- tests/test_utils.py | 44 +++++- tests/worker/test_profile.py | 18 +-- vllm/engine/arg_utils.py | 11 +- vllm/utils.py | 125 +++++++++++++++++- vllm/worker/multi_step_model_runner.py | 3 +- vllm/worker/worker.py | 68 ++++------ 8 files changed, 236 insertions(+), 60 deletions(-) create mode 100644 tests/entrypoints/llm/test_gpu_utilization.py diff --git a/tests/entrypoints/llm/test_gpu_utilization.py b/tests/entrypoints/llm/test_gpu_utilization.py new file mode 100644 index 0000000000000..c2dab300ecefb --- /dev/null +++ b/tests/entrypoints/llm/test_gpu_utilization.py @@ -0,0 +1,25 @@ +from vllm import LLM, SamplingParams + + +def test_gpu_memory_utilization(): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # makes sure gpu_memory_utilization is per-instance limit, + # not a global limit + llms = [ + LLM(model="facebook/opt-125m", + gpu_memory_utilization=0.3, + enforce_eager=True) for i in range(3) + ] + for llm in llms: + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 2c53676c5f5dd..bf609b38a94f5 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -36,7 +36,7 @@ def run_lmfe(sample_regex): llm = LLM(model="facebook/opt-125m", enforce_eager=True, guided_decoding_backend="lm-format-enforcer", - gpu_memory_utilization=0.6) + gpu_memory_utilization=0.3) sampling_params = SamplingParams(temperature=0.8, 
top_p=0.95) outputs = llm.generate( prompts=[ diff --git a/tests/test_utils.py b/tests/test_utils.py index a731b11eae81c..0bc9e5bc32a46 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,11 +5,13 @@ from typing import AsyncIterator, Tuple import pytest +import torch from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs, - get_open_port, merge_async_iterators, supports_kw) + get_open_port, memory_profiling, merge_async_iterators, + supports_kw) -from .utils import error_on_warning +from .utils import error_on_warning, fork_new_process_for_each_test @pytest.mark.asyncio @@ -270,3 +272,41 @@ def test_supports_kw(callable,kw_name,requires_kw_only, requires_kw_only=requires_kw_only, allow_var_kwargs=allow_var_kwargs ) == is_supported + + +@fork_new_process_for_each_test +def test_memory_profiling(): + # Fake out some model loading + inference memory usage to test profiling + # Memory used by other processes will show up as cuda usage outside of torch + from vllm.distributed.device_communicators.cuda_wrapper import ( + CudaRTLibrary) + lib = CudaRTLibrary() + # 512 MiB allocation outside of this instance + handle1 = lib.cudaMalloc(512 * 1024 * 1024) + + baseline_memory_in_bytes = \ + torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0] + + # load weights + + weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32) + + weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB + + with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes, + weights_memory_in_bytes=weights_memory_in_bytes) as result: + # make a memory spike, 1 GiB + spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32) + del spike + + # Add some extra non-torch memory 256 MiB (simulate NCCL) + handle2 = lib.cudaMalloc(256 * 1024 * 1024) + + # Check that the memory usage is within 5% of the expected values + non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa + torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa + assert abs(non_torch_ratio - 1) <= 0.05 + assert abs(torch_peak_ratio - 1) <= 0.05 + del weights + lib.cudaFree(handle1) + lib.cudaFree(handle2) diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index 194ea2aa506f4..79233c75714de 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -31,10 +31,6 @@ def test_gpu_memory_profiling(): is_driver_worker=True, ) - # Load the model so we can profile it - worker.init_device() - worker.load_model() - # Set 10GiB as the total gpu ram to be device-agnostic def mock_mem_info(): current_usage = torch.cuda.memory_stats( @@ -46,20 +42,24 @@ def mock_mem_info(): from unittest.mock import patch with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info): + # Load the model so we can profile it + worker.init_device() + worker.load_model() gpu_blocks, _ = worker.determine_num_available_blocks() - # Peak vram usage by torch should be 0.7077 GiB + # Peak vram usage by torch should be 0.47 GiB + # Model weights take 0.25 GiB # No memory should be allocated outside of torch # 9.0 GiB should be the utilization target - # 8.2923 GiB should be available for the KV cache + # 8.28 GiB should be available for the KV cache block_size = CacheEngine.get_cache_block_size( engine_config.cache_config, engine_config.model_config, engine_config.parallel_config) - expected_blocks = (8.2923 * 1024**3) // block_size + expected_blocks = (8.28 * 1024**3) // block_size # Check within a small tolerance for 
portability # Hardware, kernel, or dependency changes could all affect memory # utilization. - # A 10 block tolerance here should be about 6MB of wiggle room. - assert abs(gpu_blocks - expected_blocks) < 10 + # A 100 block tolerance here should be about 60MB of wiggle room. + assert abs(gpu_blocks - expected_blocks) < 100 diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0aa367a173b6c..06b8542779dc0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -487,11 +487,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='The fraction of GPU memory to be used for the model ' 'executor, which can range from 0 to 1. For example, a value of ' '0.5 would imply 50%% GPU memory utilization. If unspecified, ' - 'will use the default value of 0.9. This is a global gpu memory ' - 'utilization limit, for example if 50%% of the gpu memory is ' - 'already used before vLLM starts and --gpu-memory-utilization is ' - 'set to 0.9, then only 40%% of the gpu memory will be allocated ' - 'to the model executor.') + 'will use the default value of 0.9. This is a per-instance ' + 'limit, and only applies to the current vLLM instance.' + 'It does not matter if you have another vLLM instance running ' + 'on the same GPU. For example, if you have two vLLM instances ' + 'running on the same GPU, you can set the GPU memory utilization ' + 'to 0.5 for each instance.') parser.add_argument( '--num-gpu-blocks-override', type=int, diff --git a/vllm/utils.py b/vllm/utils.py index 45e682ac15782..73d2ae25f15ca 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -23,10 +23,12 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Future, Task from collections import UserDict, defaultdict from collections.abc import Iterable, Mapping +from dataclasses import dataclass, field from functools import lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, - Dict, Generic, Hashable, List, Literal, Optional, - OrderedDict, Set, Tuple, Type, TypeVar, Union, overload) + Dict, Generator, Generic, Hashable, List, Literal, + Optional, OrderedDict, Set, Tuple, Type, TypeVar, Union, + overload) from uuid import uuid4 import numpy as np @@ -1664,3 +1666,122 @@ def kill_process_tree(pid: int): # Finally kill the parent with contextlib.suppress(ProcessLookupError): os.kill(pid, signal.SIGKILL) + + +@dataclass +class MemorySnapshot: + """Memory snapshot.""" + torch_peak_in_bytes: int = 0 + torch_memory_in_bytes: int = 0 + timestamp: float = 0.0 + + def measure(self): + self.torch_peak_in_bytes = torch.cuda.memory_stats( + )["allocated_bytes.all.peak"] + self.torch_memory_in_bytes = torch.cuda.memory_stats( + )["allocated_bytes.all.current"] + self.timestamp = time.time() + + def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": + """support a - b""" + return MemorySnapshot( + torch_peak_in_bytes=self.torch_peak_in_bytes - + other.torch_peak_in_bytes, + torch_memory_in_bytes=self.torch_memory_in_bytes - + other.torch_memory_in_bytes, + timestamp=self.timestamp - other.timestamp) + + +@dataclass +class MemoryProfilingResult: + """Memory profiling result. 
+ """ # noqa + baseline_memory_in_bytes: int = 0 + non_kv_cache_memory_in_bytes: int = 0 + torch_peak_increase_in_bytes: int = 0 + non_torch_increase_in_bytes: int = 0 + weights_memory_in_bytes: float = 0 + before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) + after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) + profile_time: float = 0.0 + + +@contextlib.contextmanager +def memory_profiling( + baseline_memory_in_bytes: int, weights_memory_in_bytes: int +) -> Generator[MemoryProfilingResult, None, None]: + """Memory profiling context manager. + baseline_memory_in_bytes: memory used by all the components other than + the current vLLM instance. It contains: memory used by other processes, memory + used by another vLLM instance in the same process, etc. It is usually measured + before the current vLLM instance initialize the device. And we assume it is + constant during the profiling of the current vLLM instance. + weights_memory_in_bytes: memory used by PyTorch when loading the model weights. + Note that, before loading the model weights, we also initialize the device + and distributed environment, which may consume some memory. This part is not + included in the weights_memory_in_bytes because PyTorch does not control it. + + The memory in one GPU can be classified into 3 categories: + 1. memory used by anything other than the current vLLM instance. + 2. memory used by torch in the current vLLM instance. + 3. memory used in the current vLLM instance, but not by torch. + + A quantitive example: + + Before creating the current vLLM instance: + category 1: 1 GiB + category 2: 0 GiB + category 3: 0 GiB + + After creating the current vLLM instance and loading the model, + (i.e. before profiling): + category 1: 1 GiB + category 2: 2 GiB (model weights take 2 GiB) + category 3: 0.5 GiB (memory used by NCCL) + + During profiling (peak): + category 1: 1 GiB + category 2: 4 GiB (peak activation tensors take 2 GiB) + category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) + + After profiling: + category 1: 1 GiB + category 2: 3 GiB (after garbage-collecting activation tensors) + category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) + + In this case, non-kv cache takes 5 GiB in total, including: + a. 2 GiB used by the model weights (category 2) + b. 2 GiB reserved for the peak activation tensors (category 2) + c. 1 GiB used by non-torch components (category 3) + + The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`. + + The increase of ``torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). + + (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`), + subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`. 
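To make the accounting above concrete, here is a short, dependency-free sketch that re-derives the quantitative example from this docstring. It is illustrative arithmetic only (the GiB figures are the ones quoted above), not code from the patch.

```
GiB = 1024 ** 3

# Figures quoted in the quantitative example above.
weights_memory = 2 * GiB          # (a.) model weights, category 2
torch_peak_increase = 2 * GiB     # (b.) peak activation tensors, category 2
total_gpu_used_after = 5 * GiB    # categories 1 + 2 + 3 after profiling
baseline_memory = 1 * GiB         # category 1, measured before init
torch_current_increase = 1 * GiB  # torch "current" went from 2 GiB to 3 GiB

# (c.) the non-torch share falls out by elimination, as described above.
non_torch_increase = (total_gpu_used_after - baseline_memory -
                      weights_memory - torch_current_increase)
assert non_torch_increase == 1 * GiB

non_kv_cache_memory = (weights_memory + torch_peak_increase +
                       non_torch_increase)
print(f"non-KV-cache memory: {non_kv_cache_memory / GiB:.1f} GiB")  # 5.0
```

In essence, the context manager below reduces to this subtraction once the before/after snapshots have been taken with `torch.cuda.memory_stats()` and `torch.cuda.mem_get_info()`.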
+ """ # noqa + torch.cuda.reset_peak_memory_stats() + + result = MemoryProfilingResult() + + result.baseline_memory_in_bytes = baseline_memory_in_bytes + # the part of memory used for holding the model weights + result.weights_memory_in_bytes = weights_memory_in_bytes + + result.before_profile.measure() + + yield result + + gc.collect() + torch.cuda.empty_cache() + + result.after_profile.measure() + + diff = result.after_profile - result.before_profile + result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes + current_cuda_memory_bytes = torch.cuda.mem_get_info( + )[1] - torch.cuda.mem_get_info()[0] + result.non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff.torch_memory_in_bytes # noqa + result.profile_time = diff.timestamp + result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes # noqa diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index e08a61e31fe42..18b03bf1bfb56 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -645,7 +645,8 @@ def _advance_step(self, model_input: StatefulModelInput, return model_input def load_model(self) -> None: - return self._base_model_runner.load_model() + self._base_model_runner.load_model() + self.model_memory_usage = self._base_model_runner.model_memory_usage def save_sharded_state( self, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index a368bb9ee9a5b..f51b51d433d3d 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,7 +1,6 @@ """A GPU worker class.""" import gc import os -import time from typing import Dict, List, Optional, Set, Tuple, Type, Union import torch @@ -22,6 +21,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SequenceGroupMetadata, SequenceGroupMetadataDelta) +from vllm.utils import GiB_bytes, memory_profiling from vllm.worker.cache_engine import CacheEngine from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -192,33 +192,22 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: torch.cuda.reset_peak_memory_stats() free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info() - start_time = time.time() # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - self.model_runner.profile_run() - torch.cuda.synchronize() + with memory_profiling(baseline_memory_in_bytes=total_gpu_memory - + self.init_gpu_memory, + weights_memory_in_bytes=self.model_runner. + model_memory_usage) as result: + self.model_runner.profile_run() + torch.cuda.synchronize() self._assert_memory_footprint_increased_during_profiling() - # Get the peak memory allocation recorded by torch - peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] - - # Check for any memory left around that may have been allocated on the - # gpu outside of `torch`. 
NCCL operations, for example, can use a few - # GB during a forward pass - torch.cuda.empty_cache() - torch_allocated_bytes = torch.cuda.memory_stats( - )["allocated_bytes.all.current"] - total_allocated_bytes = torch.cuda.mem_get_info( - )[1] - torch.cuda.mem_get_info()[0] - non_torch_allocations = total_allocated_bytes - torch_allocated_bytes - if non_torch_allocations > 0: - peak_memory += non_torch_allocations - - available_kv_cache_memory = ( - total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) + memory_for_current_instance = total_gpu_memory * \ + self.cache_config.gpu_memory_utilization + available_kv_cache_memory = (memory_for_current_instance - + result.non_kv_cache_memory_in_bytes) # Calculate the number of blocks that can be allocated with the # profiled peak memory. @@ -233,24 +222,23 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) - end_time = time.time() - logger.info( - "Memory profiling results: " - "duration=%.2f seconds, " - "total_gpu_memory=%.2fGiB, " - "initial_memory_usage=%.2fGiB, " - "peak_torch_memory=%.2fGiB, " - "memory_usage_post_profile=%.2fGiB, " - "non_torch_memory=%.2fGiB, " - "kv_cache_size=%.2fGiB, " - "gpu_memory_utilization=%.2f.", end_time - start_time, - total_gpu_memory / (1024**3), - (total_gpu_memory - free_memory_pre_profile) / (1024**3), - (peak_memory - non_torch_allocations) / (1024**3), - total_allocated_bytes / (1024**3), - non_torch_allocations / (1024**3), - available_kv_cache_memory / (1024**3), - self.cache_config.gpu_memory_utilization) + msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n" + "the current vLLM instance can use " + "total_gpu_memory " + f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" + " x gpu_memory_utilization " + f"({self.cache_config.gpu_memory_utilization:.2f})" + f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" + "model weights take " + f"{(result.weights_memory_in_bytes / GiB_bytes):.2f}GiB;" + " non_torch_memory takes " + f"{(result.non_torch_increase_in_bytes / GiB_bytes):.2f}GiB;" + " PyTorch activation peak memory takes " + f"{(result.torch_peak_increase_in_bytes / GiB_bytes):.2f}GiB;" + " the rest of the memory reserved for KV Cache is " + f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.") + + logger.info(msg) # Final cleanup if self.model_runner.lora_manager: From 35ffa682b1cd3f47eb6cda586a16dab5c0401477 Mon Sep 17 00:00:00 2001 From: bk-TurbaAI Date: Mon, 16 Dec 2024 23:20:39 +0100 Subject: [PATCH 46/56] [Docs] hint to enable use of GPU performance counters in profiling tools for multi-node distributed serving (#11235) Co-authored-by: Michael Goin --- docs/source/serving/distributed_serving.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index 4d57206e53a05..b24ba53e59694 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -54,7 +54,7 @@ Multi-Node Inference and Serving If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. 
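(Editorial aside on the `determine_num_available_blocks` rewrite shown just above, before this docs hunk: the new log message spells out how the KV-cache budget is derived, and the arithmetic is simple enough to restate standalone. The numbers below mirror the updated `tests/worker/test_profile.py`; nothing here is part of either diff.)

```
GiB = 1024 ** 3

# Mirrors tests/worker/test_profile.py: a 10 GiB GPU at 90% utilization,
# 0.25 GiB of weights, 0.47 GiB torch activation peak, no non-torch usage.
total_gpu_memory = 10 * GiB
gpu_memory_utilization = 0.9
non_kv_cache_memory = (0.25 + 0.47 + 0.0) * GiB

memory_for_this_instance = total_gpu_memory * gpu_memory_utilization
available_kv_cache_memory = memory_for_this_instance - non_kv_cache_memory
print(f"budget: {memory_for_this_instance / GiB:.2f} GiB, "
      f"KV cache: {available_kv_cache_memory / GiB:.2f} GiB")  # 9.00 / 8.28
```

Dividing the KV-cache figure by the per-block size from `CacheEngine.get_cache_block_size(...)` then gives the `num_gpu_blocks` value the test asserts on, to within its 100-block tolerance.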
-The first step, is to start containers and organize them into a cluster. We have provided a helper `script `_ to start the cluster. +The first step, is to start containers and organize them into a cluster. We have provided a helper `script `_ to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have ``CAP_SYS_ADMIN`` to the docker container by using the ``--cap-add`` option in the docker run command. Pick a node as the head node, and run the following command: From c301616ed23fef433db1a49df332b9d61d3178ad Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 16 Dec 2024 15:53:18 -0800 Subject: [PATCH 47/56] [ci][tests] add gh200 tests (#11244) Signed-off-by: youkaichao --- .buildkite/run-gh200-test.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .buildkite/run-gh200-test.sh diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh new file mode 100644 index 0000000000000..d25510c47fe6b --- /dev/null +++ b/.buildkite/run-gh200-test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# This script build the GH200 docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +DOCKER_BUILDKIT=1 docker build . \ + --target test \ + -platform "linux/arm64" \ + -t gh200-test \ + --build-arg max_jobs=66 \ + --build-arg nvcc_threads=2 \ + --build-arg torch_cuda_arch_list="9.0+PTX" \ + --build-arg vllm_fa_cmake_gpu_arches="90-real" + +# Setup cleanup +remove_docker_container() { docker rm -f gh200-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and test offline inference +docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' + python3 examples/offline_inference.py +' From 88a412ed3d964de3443c42a6a35108115ee0ad25 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 16 Dec 2024 16:15:22 -0800 Subject: [PATCH 48/56] [torch.compile] fast inductor (#11108) Signed-off-by: youkaichao Co-authored-by: Tyler Michael Smith --- vllm/compilation/backends.py | 213 +++++++++++++++++- vllm/config.py | 415 ++++++++++++++++++++++++++++++++++- vllm/envs.py | 3 + 3 files changed, 624 insertions(+), 7 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 4a5dc337d01b8..0c7bbfe599b02 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,6 +1,10 @@ +import ast import copy import dataclasses +import os +import pprint import time +from collections import defaultdict from contextlib import ExitStack from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple from unittest.mock import patch @@ -21,6 +25,122 @@ logger = init_logger(__name__) +class InductorHashCache: + """ + Disk format: a Python list of tuples, each tuple is + (runtime_shape, graph_index, hash_str) + We use list of tuple for readability. + + In-memory format: a defaultdict of dict, where the key is + runtime_shape, and the value is a dict of graph_index to hash_str. + + The data is essentially `Dict[Optional[int], Dict[int, str]]`, + we don't use json here because json doesn't support int as key. + + TODO: better off-the-shelf solution to serialize the data? 
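Because the on-disk format described here is just a Python literal, a round trip is easy to demonstrate. The standalone sketch below mimics the `serialize`/`deserialize` pair that follows, writing with `pprint` and reading back with `ast.literal_eval`; the hash strings are invented for illustration.

```
import ast
import pprint
from collections import defaultdict

# In-memory form: {runtime_shape (or None): {graph_index: hash_str}}.
cache = defaultdict(dict)
cache[None][0] = "fx_hash_general_graph0"   # invented hash strings
cache[8][0] = "fx_hash_bs8_graph0"
cache[8][1] = "fx_hash_bs8_graph1"

# Disk form: a literal list of (runtime_shape, graph_index, hash_str) tuples.
data = [(shape, idx, h) for shape, per_graph in cache.items()
        for idx, h in per_graph.items()]
text = pprint.PrettyPrinter(indent=4).pformat(data)

restored = defaultdict(dict)
for shape, idx, h in ast.literal_eval(text):
    restored[shape][idx] = h
assert restored == cache
```

Parsing with `ast.literal_eval` rather than `eval` is what keeps a cache file that has been copied between machines from ever executing arbitrary code, as the comment in `deserialize` notes.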
+ """ + + def __init__(self, cache_dir: str, disabled: bool = False): + self.cache: defaultdict = defaultdict(dict) + self.disabled = disabled + self.cache_dir = cache_dir + self.cache_file_path = os.path.join(cache_dir, + "inductor_hash_cache.py") + if disabled: + return + # set flags so that Inductor and Triton store their cache + # in the cache_dir, then users only need to copy the cache_dir + # to another machine to reuse the cache. + inductor_cache = os.path.join(cache_dir, "inductor_cache") + os.makedirs(inductor_cache, exist_ok=True) + os.environ["TORCHINDUCTOR_CACHE_DIR"] = inductor_cache + triton_cache = os.path.join(cache_dir, "triton_cache") + os.makedirs(triton_cache, exist_ok=True) + os.environ["TRITON_CACHE_DIR"] = triton_cache + if os.path.exists(self.cache_file_path): + with open(self.cache_file_path) as f: + self.deserialize(f.read()) + + def deserialize(self, data: str): + # we use ast.literal_eval to parse the data + # because it is a safe way to parse Python literals. + # do not use eval(), it is unsafe. + list_data = ast.literal_eval(data) + for runtime_shape, graph_index, hash_str in list_data: + self.cache[runtime_shape][graph_index] = hash_str + + def serialize(self) -> str: + data = [] + for runtime_shape, graph_index_to_hash_str in self.cache.items(): + for graph_index, hash_str in graph_index_to_hash_str.items(): + data.append((runtime_shape, graph_index, hash_str)) + printer = pprint.PrettyPrinter(indent=4) + return printer.pformat(data) + + def save_to_file(self): + if self.disabled: + return + with open(self.cache_file_path, "w") as f: + f.write(self.serialize()) + + def __contains__(self, key: Tuple[Optional[int], int]) -> bool: + if self.disabled: + return False + runtime_shape, graph_index = key + return runtime_shape in self.cache and graph_index in self.cache[ + runtime_shape] + + def __getitem__(self, key: Tuple[Optional[int], int]) -> str: + if self.disabled: + raise KeyError("cannot read from disabled cache") + runtime_shape, graph_index = key + return self.cache[runtime_shape][graph_index] + + def __setitem__(self, key: Tuple[Optional[int], int], value: str): + # setitem for disabled cache is fine, because we + # don't actually write to the disk + runtime_shape, graph_index = key + self.cache[runtime_shape][graph_index] = value + + +class AlwaysHitShapeEnv: + """ + Why do we need this class: + + For normal `torch.compile` usage, every compilation will have + one Dynamo bytecode compilation and one Inductor compilation. + The Inductor compilation happens under the context of the + Dynamo bytecode compilation, and that context is used to + determine the dynamic shape information, etc. + + For our use case, we only run Dynamo bytecode compilation once, + and run Inductor compilation multiple times with different shapes + plus a general shape. The compilation for specific shapes happens + outside of the context of the Dynamo bytecode compilation. At that + time, we don't have shape environment to provide to Inductor, and + it will fail the Inductor code cache lookup. + + By providing a dummy shape environment that always hits, we can + make the Inductor code cache lookup always hit, and we can + compile the graph for different shapes as needed. + + The following dummy methods are obtained by trial-and-error + until it works. 
+ """ + + def __init__(self) -> None: + self.guards: List[Any] = [] + + def evaluate_guards_expression(self, *args, **kwargs): + return True + + def get_pruned_guards(self, *args, **kwargs): + return [] + + def produce_guards_expression(self, *args, **kwargs): + return "" + + def wrap_inductor(graph, example_inputs, additional_inductor_config, @@ -55,9 +175,93 @@ def wrap_inductor(graph, # inductor can inplace modify the graph, so we need to copy it # see https://github.com/pytorch/pytorch/issues/138980 graph = copy.deepcopy(graph) - compiled_graph = compile_fx(graph, - example_inputs, - config_patches=current_config) + + cache_data = compilation_config.inductor_hash_cache + if (runtime_shape, graph_index) in cache_data: + # we compiled this graph before + # so we can directly lookup the compiled graph via hash + hash_str = cache_data[(runtime_shape, graph_index)] + if graph_index == 0: + # adds some info logging for the first graph + logger.info( + "Directly lookup the graph for shape %s from the cache", + str(runtime_shape)) # noqa + logger.debug( + "directly lookup the %s-th graph for shape %s via hash %s", + graph_index, str(runtime_shape), hash_str) + from torch._inductor.codecache import FxGraphCache + with patch("torch._inductor.codecache.FxGraphCache._get_shape_env", + lambda *args, **kwargs: AlwaysHitShapeEnv()): + inductor_compiled_graph = FxGraphCache._lookup_graph( + hash_str, example_inputs, True, False) + assert inductor_compiled_graph is not None, ( + "Inductor cache lookup failed. Please remove" + f"the cache file {compilation_config.inductor_hash_cache.cache_file_path} and try again." # noqa + ) + + # Inductor calling convention (function signature): + # f(list) -> tuple + # Dynamo calling convention (function signature): + # f(*args) -> Any + + # need to know if the graph returns a tuple + from torch._inductor.compile_fx import graph_returns_tuple + returns_tuple = graph_returns_tuple(graph) + + # this is the graph we return to Dynamo to run + def compiled_graph(*args): + # convert args to list + list_args = list(args) + graph_output = inductor_compiled_graph(list_args) + # unpack the tuple if needed + if returns_tuple: + return graph_output + else: + return graph_output[0] + else: + # it's the first time we compile this graph + # the assumption is that we don't have nested Inductor compilation. + # compiled_fx_graph_hash will only be called once, and we can hook + # it to get the hash of the compiled graph directly. + from torch._inductor.codecache import compiled_fx_graph_hash + + def hijack_compiled_fx_graph_hash(*args, **kwargs): + out = compiled_fx_graph_hash(*args, **kwargs) + # store the hash in the cache + nonlocal cache_data + cache_data[(runtime_shape, graph_index)] = out[0] + if graph_index == 0: + # adds some info logging for the first graph + logger.info("Cache the graph of shape %s for later use", + str(runtime_shape)) + logger.debug("store the %s-th graph for shape %s via hash %s", + graph_index, str(runtime_shape), out[0]) + return out + + def _check_can_cache(*args, **kwargs): + # no error means it can be cached. + # Inductor refuses to cache the graph outside of Dynamo + # tracing context, and also disables caching for graphs + # with high-order ops. + # For vLLM, in either case, we want to cache the graph. 
+ # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa + return + + def _get_shape_env(): + return AlwaysHitShapeEnv() + + with patch(# for hijacking the hash of the compiled graph + "torch._inductor.codecache.compiled_fx_graph_hash", + hijack_compiled_fx_graph_hash), \ + patch(# for providing a dummy shape environment + "torch._inductor.codecache.FxGraphCache._get_shape_env", + _get_shape_env), \ + patch(# for forcing the graph to be cached + "torch._inductor.codecache.FxGraphCache._check_can_cache", + _check_can_cache): + compiled_graph = compile_fx(graph, + example_inputs, + config_patches=current_config) # after compiling the last graph, record the end time if graph_index == num_graphs - 1: @@ -457,6 +661,9 @@ def __call__(self, *args) -> Any: # finished compilations for all required shapes if self.is_last_graph and not self.to_be_compiled_sizes: + + # save the hash of the inductor graph for the next run + self.compilation_config.inductor_hash_cache.save_to_file() end_monitoring_torch_compile(self.vllm_config) if not entry.use_cudagraph: diff --git a/vllm/config.py b/vllm/config.py index fce8011be4015..9cfd08024ea7b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,6 +3,7 @@ import enum import hashlib import json +import os import warnings from contextlib import contextmanager from dataclasses import dataclass, field, replace @@ -162,6 +163,30 @@ class ModelConfig: which allows no processors. """ + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: List[Any] = [] + factors.append(self.model) + factors.append(self.dtype) + factors.append(self.quantization) + factors.append(self.quantization_param_path) + factors.append(self.revision) + factors.append(self.code_revision) + factors.append(self.trust_remote_code) + factors.append(self.rope_scaling) + factors.append(self.rope_theta) + return hashlib.sha256(str(factors).encode()).hexdigest() + def __init__(self, model: str, task: Union[TaskOption, Literal["draft"]], @@ -203,6 +228,8 @@ def __init__(self, self.seed = seed self.revision = revision self.code_revision = code_revision + self.rope_scaling = rope_scaling + self.rope_theta = rope_theta if hf_overrides is None: hf_overrides = {} @@ -832,6 +859,24 @@ class CacheConfig: cpu_offload_gb: Size of the CPU offload buffer in GiB. """ + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: List[Any] = [] + factors.append(self.cache_dtype) + # `cpu_offload_gb` does not use `torch.compile` yet. 
+ hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + def __init__( self, block_size: int, @@ -928,6 +973,24 @@ class TokenizerPoolConfig: pool_type: Union[str, Type["BaseTokenizerGroup"]] extra_config: dict + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + def __post_init__(self): if self.pool_type not in ("ray", ) and not isinstance( self.pool_type, type): @@ -1010,6 +1073,24 @@ class LoadConfig: default_factory=dict) ignore_patterns: Optional[Union[List[str], str]] = None + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + def __post_init__(self): model_loader_extra_config = self.model_loader_extra_config or {} if isinstance(model_loader_extra_config, str): @@ -1073,6 +1154,19 @@ class ParallelConfig: rank: int = 0 + def compute_hash(self): + """ + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: List[Any] = [] + factors.append(self.pipeline_parallel_size) + factors.append(self.tensor_parallel_size) + return hashlib.sha256(str(factors).encode()).hexdigest() + def __post_init__(self) -> None: self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size @@ -1209,6 +1303,24 @@ class SchedulerConfig: chunked_prefill_enabled: bool = field(init=False) + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. 
+ factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + def __post_init__(self) -> None: if self.max_num_batched_tokens is None: if self.enable_chunked_prefill: @@ -1286,6 +1398,25 @@ class DeviceConfig: device: Optional[torch.device] device_type: str + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # the device/platform information will be summarized + # by torch/vllm automatically. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + def __init__(self, device: str = "auto") -> None: if device == "auto": # Automated device type detection @@ -1313,6 +1444,24 @@ class SpeculativeConfig: decoding with top-1 proposals. """ + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # spec decode does not use `torch.compile` yet. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + @staticmethod def maybe_create_spec_config( target_model_config: ModelConfig, @@ -1753,6 +1902,24 @@ class LoRAConfig: long_lora_scaling_factors: Optional[Tuple[float]] = None bias_enabled: bool = False + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # LoRA is not compatible with `torch.compile` . + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + def __post_init__(self): # Setting the maximum rank to 256 should be able to satisfy the vast # majority of applications. @@ -1802,6 +1969,24 @@ class PromptAdapterConfig: max_cpu_prompt_adapters: Optional[int] = None prompt_adapter_dtype: Optional[torch.dtype] = None + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. 
+ factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + def __post_init__(self): if self.max_prompt_adapters < 1: @@ -1830,6 +2015,24 @@ class MultiModalConfig: for each :class:`~vllm.multimodal.MultiModalPlugin`. """ + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + # TODO: Add configs to init vision tower or not. @@ -1869,6 +2072,24 @@ class PoolerConfig: ``math-shepherd-mistral-7b-prm`` model. """ + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + @staticmethod def from_json(json_str: str) -> "PoolerConfig": return PoolerConfig(**json.loads(json_str)) @@ -2103,6 +2324,24 @@ class DecodingConfig: # 'outlines' / 'lm-format-enforcer' / 'xgrammar' guided_decoding_backend: str = 'xgrammar' + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + def __post_init__(self): valid_guided_backends = ['outlines', 'lm-format-enforcer', 'xgrammar'] backend = self.guided_decoding_backend @@ -2124,6 +2363,24 @@ class ObservabilityConfig: # If set, collects the model execute time for the request. collect_model_execute_time: bool = False + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. 
+ factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + def __post_init__(self): if not is_otel_available() and self.otlp_traces_endpoint is not None: raise ValueError( @@ -2165,6 +2422,24 @@ class KVTransferConfig(BaseModel): # The KV connector port, used to build distributed connection kv_port: int = 14579 + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: List[Any] = [] + hash_str = hashlib.md5(str(factors).encode()).hexdigest() + return hash_str + @classmethod def from_cli(cls, cli_value: str) -> "KVTransferConfig": """Parse the CLI value for the kv cache transfer config.""" @@ -2234,6 +2509,9 @@ class CompilationConfig(BaseModel): - 2: dynamo once. - 3: piecewise compilation. - debug_dump_path: the path to dump the debug information. + - cache_dir: the directory to store the compiled graph, to + accelerate Inductor compilation. By default, it will use + model-related information to generate a cache directory. - backend: the backend for compilation. It needs to be a string. - "" (empty string): use the default backend. - "eager"/"openxla"/...: use the specified backend registered in PyTorch. @@ -2302,12 +2580,10 @@ class CompilationConfig(BaseModel): """ # noqa level: int = 0 debug_dump_path: str = "" + cache_dir: str = "" backend: str = "" custom_ops: List[str] = Field(default_factory=list) - splitting_ops: List[str] = Field(default_factory=lambda: [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - ]) + splitting_ops: List[str] = Field(default=None) # type: ignore use_inductor: bool = True candidate_compile_sizes: Optional[List[int]] = Field(default=None) @@ -2371,12 +2647,37 @@ def model_post_init(self, __context: Any) -> None: enabled_custom_ops: Counter[str] = PrivateAttr disabled_custom_ops: Counter[str] = PrivateAttr compilation_time: float = PrivateAttr + # should be InductorHashCache, but Pydantic does not support it + inductor_hash_cache: Any = PrivateAttr # Per-model forward context # Mainly used to store attention cls # Map from layer name to the attention cls static_forward_context: Dict[str, Any] = PrivateAttr + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. 
+ """ + factors: List[Any] = [] + factors.append(self.level) + factors.append(self.backend) + factors.append(self.custom_ops) + factors.append(self.splitting_ops) + factors.append(self.use_inductor) + factors.append(self.inductor_compile_config) + factors.append(self.inductor_passes) + factors.append(self.pass_config.uuid()) + return hashlib.sha256(str(factors).encode()).hexdigest() + def __repr__(self) -> str: exclude = { "static_forward_context", @@ -2405,6 +2706,27 @@ def model_post_init(self, __context: Any) -> None: count_all = self.custom_ops.count("all") assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" + if self.splitting_ops is None: + if envs.VLLM_USE_V1: + # v1 must split the graph on attention ops + # for piecewise cudagraph + self.splitting_ops = [ + "vllm.unified_attention", + "vllm.unified_attention_with_output", + ] + else: + # v0 can use full graph compilation without splitting, + # splitting is optional. + # right now we still need it. kv cache shape + # will be included in the graph if we don't split + # the graph. + # TODO: hide kv cache in static forward context + # so that inductor does not see it. + self.splitting_ops = [ + "vllm.unified_attention", + "vllm.unified_attention_with_output", + ] + for k, v in self.inductor_passes.items(): if not isinstance(v, str): assert callable(v), ( @@ -2444,6 +2766,30 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: # TODO: pass user-specified backend to piecewise compilation # merge with the config use_inductor assert self.level == CompilationLevel.PIECEWISE + + if not self.cache_dir: + # no provided cache dir, generate one based on the known factors + # that affects the compilation. if none of the factors change, + # the cache dir will be the same so that we can reuse the compiled + # graph. + hash_key = vllm_config.compute_hash() + cache_dir = os.path.join( + envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key, + f"rank_{vllm_config.parallel_config.rank}") + os.makedirs(cache_dir, exist_ok=True) + self.cache_dir = cache_dir + + disabled = envs.VLLM_DISABLE_COMPILE_CACHE + from vllm.compilation.backends import InductorHashCache + self.inductor_hash_cache: InductorHashCache = InductorHashCache( + self.cache_dir, disabled=disabled) + if disabled: + logger.info("vLLM's torch.compile cache is disabled.") + else: + logger.info( + "Using cache directory: %s for vLLM's torch.compile", + self.cache_dir) + from vllm.compilation.backends import VllmBackend return VllmBackend(vllm_config) @@ -2520,6 +2866,67 @@ class VllmConfig: init=True) # type: ignore instance_id: str = "" + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. 
+ """ + factors: List[Any] = [] + # summarize system state + from torch._inductor.codecache import CacheBase + system_factors = CacheBase.get_system() + factors.append(system_factors) + + # summarize pytorch state + from torch._inductor.codecache import torch_key + torch_factors = torch_key() + factors.append(torch_factors) + + # summarize vllm config + vllm_factors: List[Any] = [] + from vllm import __version__ + vllm_factors.append(__version__) + if self.model_config: + vllm_factors.append(self.model_config.compute_hash()) + if self.cache_config: + vllm_factors.append(self.cache_config.compute_hash()) + if self.parallel_config: + vllm_factors.append(self.parallel_config.compute_hash()) + if self.scheduler_config: + vllm_factors.append(self.scheduler_config.compute_hash()) + if self.device_config: + vllm_factors.append(self.device_config.compute_hash()) + if self.load_config: + vllm_factors.append(self.load_config.compute_hash()) + if self.lora_config: + vllm_factors.append(self.lora_config.compute_hash()) + if self.speculative_config: + vllm_factors.append(self.speculative_config.compute_hash()) + if self.decoding_config: + vllm_factors.append(self.decoding_config.compute_hash()) + if self.observability_config: + vllm_factors.append(self.observability_config.compute_hash()) + if self.prompt_adapter_config: + vllm_factors.append(self.prompt_adapter_config.compute_hash()) + if self.quant_config: + pass # should be captured by model_config.quantization + if self.compilation_config: + vllm_factors.append(self.compilation_config.compute_hash()) + if self.kv_transfer_config: + vllm_factors.append(self.kv_transfer_config.compute_hash()) + + factors.append(vllm_factors) + + hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] + return hash_str + def pad_for_cudagraph(self, batch_size: int) -> int: # if batch_size > self.compilation_config.max_capture_size, # it should raise an IndexError. diff --git a/vllm/envs.py b/vllm/envs.py index da17b747ea215..18870c1c6b51a 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -71,6 +71,7 @@ VLLM_USE_V1: bool = False VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 + VLLM_DISABLE_COMPILE_CACHE: bool = False def get_default_cache_root(): @@ -463,6 +464,8 @@ def get_default_config_root(): lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))), "VLLM_LOG_BATCHSIZE_INTERVAL": lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), + "VLLM_DISABLE_COMPILE_CACHE": + lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))), } # end-env-vars-definition From 35bae114a89e03e3dc6a6d2f758378e58938bffa Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 16 Dec 2024 17:22:38 -0800 Subject: [PATCH 49/56] fix gh200 tests on main (#11246) Signed-off-by: youkaichao --- .buildkite/run-gh200-test.sh | 4 ++-- docs/source/serving/deploying_with_docker.rst | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index d25510c47fe6b..d06604f96f2b8 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -6,8 +6,8 @@ set -ex # Try building the docker image DOCKER_BUILDKIT=1 docker build . 
\ - --target test \ - -platform "linux/arm64" \ + --target vllm-openai \ + --platform "linux/arm64" \ -t gh200-test \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst index 11a9f12fd17cd..56f0020a1011a 100644 --- a/docs/source/serving/deploying_with_docker.rst +++ b/docs/source/serving/deploying_with_docker.rst @@ -54,16 +54,13 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- # Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB) $ DOCKER_BUILDKIT=1 sudo docker build . \ --target vllm-openai \ - -platform "linux/arm64" \ + --platform "linux/arm64" \ -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ --build-arg torch_cuda_arch_list="9.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" - - - To run vLLM: .. code-block:: console From 0064f697d318a2ce38342f7c20754cf229311b8b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 16 Dec 2024 22:39:58 -0500 Subject: [PATCH 50/56] [CI] Add test case with JSON schema using references + use xgrammar by default with OpenAI parse (#10935) Signed-off-by: mgoin --- tests/entrypoints/conftest.py | 39 +++++++++++++++++++ tests/entrypoints/llm/test_guided_generate.py | 28 +++++++++++++ vllm/entrypoints/openai/protocol.py | 2 +- 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index 0f7d15e1d85aa..ef74062ce4b41 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -100,6 +100,45 @@ def sample_complex_json_schema(): } +@pytest.fixture +def sample_definition_json_schema(): + return { + '$defs': { + 'Step': { + 'properties': { + 'explanation': { + 'title': 'Explanation', + 'type': 'string' + }, + 'output': { + 'title': 'Output', + 'type': 'string' + } + }, + 'required': ['explanation', 'output'], + 'title': 'Step', + 'type': 'object' + } + }, + 'properties': { + 'steps': { + 'items': { + '$ref': '#/$defs/Step' + }, + 'title': 'Steps', + 'type': 'array' + }, + 'final_answer': { + 'title': 'Final Answer', + 'type': 'string' + } + }, + 'required': ['steps', 'final_answer'], + 'title': 'MathReasoning', + 'type': 'object' + } + + @pytest.fixture def sample_guided_choice(): return [ diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index de6257cfc551c..ed50ec6bbc9eb 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -104,6 +104,34 @@ def test_guided_complex_json_completion(sample_complex_json_schema, llm): schema=sample_complex_json_schema) +@pytest.mark.skip_global_cleanup +def test_guided_definition_json_completion(sample_definition_json_schema, llm): + sampling_params = SamplingParams(temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams( + json=sample_definition_json_schema)) + outputs = llm.generate(prompts=[ + f"Give an example JSON for solving 8x + 7 = -23 " + f"that fits this schema: {sample_definition_json_schema}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True) + + assert outputs is not None + + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + print(f"Prompt: {prompt!r}, Generated text: 
{generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, + schema=sample_definition_json_schema) + + @pytest.mark.skip_global_cleanup def test_guided_choice_completion(sample_guided_choice, llm): sampling_params = SamplingParams( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6ed7c2e9dcd6b..5a70e0952666b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -387,7 +387,7 @@ def to_sampling_params( assert json_schema is not None self.guided_json = json_schema.json_schema if self.guided_decoding_backend is None: - self.guided_decoding_backend = "lm-format-enforcer" + self.guided_decoding_backend = "xgrammar" guided_decoding = GuidedDecodingParams.from_optional( json=self._get_guided_json_from_tool() or self.guided_json, From 66d4b16724226e9f377551198cc7425c12ddafae Mon Sep 17 00:00:00 2001 From: kYLe Date: Tue, 17 Dec 2024 00:09:58 -0600 Subject: [PATCH 51/56] [Frontend] Add OpenAI API support for input_audio (#11027) Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- .../serving/openai_compatible_server.md | 10 +- docs/source/usage/multimodal_inputs.rst | 90 ++++++++++++- ...i_chat_completion_client_for_multimodal.py | 34 ++++- tests/entrypoints/openai/test_audio.py | 125 +++++++++++++++++- vllm/entrypoints/chat_utils.py | 65 +++++++-- 5 files changed, 301 insertions(+), 23 deletions(-) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 14a5b02d72aa5..1bc8d32d2d161 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -34,11 +34,6 @@ We currently support the following OpenAI APIs: - *Note: `suffix` parameter is not supported.* - [Chat Completions API](#chat-api) (`/v1/chat/completions`) - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template). - - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst). - - *Note: `image_url.detail` parameter is not supported.* - - We also support `audio_url` content type for audio files. - - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. - - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).* - *Note: `parallel_tool_calls` and `user` parameters are ignored.* - [Embeddings API](#embeddings-api) (`/v1/embeddings`) - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`). @@ -209,6 +204,11 @@ The following extra parameters are supported: Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/chat) for more details. +We support both [Vision](https://platform.openai.com/docs/guides/vision)- and +[Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; +see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information. +- *Note: `image_url.detail` parameter is not supported.* + #### Extra parameters The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. 
diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst index 1e00f26f9a3ba..680382e457cc5 100644 --- a/docs/source/usage/multimodal_inputs.rst +++ b/docs/source/usage/multimodal_inputs.rst @@ -315,7 +315,95 @@ You can use `these tests `_. +Here is a simple example using Ultravox-v0.3. + +First, launch the OpenAI-compatible server: + +.. code-block:: bash + + vllm serve fixie-ai/ultravox-v0_3 + +Then, you can use the OpenAI client as follows: + +.. code-block:: python + + import base64 + import requests + from openai import OpenAI + from vllm.assets.audio import AudioAsset + + def encode_base64_content_from_url(content_url: str) -> str: + """Encode a content retrieved from a remote url to base64 format.""" + + with requests.get(content_url) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode('utf-8') + + return result + + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # Any format supported by librosa is supported + audio_url = AudioAsset("winning_call").url + audio_base64 = encode_base64_content_from_url(audio_url) + + chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" + }, + { + "type": "input_audio", + "input_audio": { + "data": audio_base64, + "format": "wav" + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_base64.choices[0].message.content + print("Chat completion output from input audio:", result) + +Alternatively, you can pass :code:`audio_url`, which is the audio counterpart of :code:`image_url` for image input: + +.. code-block:: python + + chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_url.choices[0].message.content + print("Chat completion output from audio url:", result) A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py index 0ec4f71dddf93..6a160fd70423f 100644 --- a/examples/openai_chat_completion_client_for_multimodal.py +++ b/examples/openai_chat_completion_client_for_multimodal.py @@ -153,10 +153,37 @@ def run_multi_image() -> None: # Audio input inference def run_audio() -> None: - # Any format supported by librosa is supported audio_url = AudioAsset("winning_call").url + audio_base64 = encode_base64_content_from_url(audio_url) + + # OpenAI-compatible schema (`input_audio`) + chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" 
+ }, + { + "type": "input_audio", + "input_audio": { + # Any format supported by librosa is supported + "data": audio_base64, + "format": "wav" + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_base64.choices[0].message.content + print("Chat completion output from input audio:", result) - # Use audio url in the payload + # HTTP URL chat_completion_from_url = client.chat.completions.create( messages=[{ "role": @@ -169,6 +196,7 @@ def run_audio() -> None: { "type": "audio_url", "audio_url": { + # Any format supported by librosa is supported "url": audio_url }, }, @@ -181,7 +209,7 @@ def run_audio() -> None: result = chat_completion_from_url.choices[0].message.content print("Chat completion output from audio url:", result) - audio_base64 = encode_base64_content_from_url(audio_url) + # base64 URL chat_completion_from_base64 = client.chat.completions.create( messages=[{ "role": diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index b579dcbb5c402..0a29d77e73abc 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -155,6 +155,61 @@ async def test_single_chat_session_audio_base64encoded( assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +async def test_single_chat_session_input_audio( + client: openai.AsyncOpenAI, model_name: str, audio_url: str, + base64_encoded_audio: Dict[str, str]): + messages = [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": base64_encoded_audio[audio_url], + "format": "wav" + } + }, + { + "type": "text", + "text": "What's happening in this audio?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=202, total_tokens=212) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) @@ -212,11 +267,72 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, assert "".join(chunks) == output +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) +async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI, + model_name: str, audio_url: str, + base64_encoded_audio: Dict[str, + str]): + messages = [{ + "role": + "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": base64_encoded_audio[audio_url], + "format": "wav" + } + }, + { + "type": "text", + "text": "What's happening in this audio?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + stream=True, + ) + chunks: List[str] = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert delta.content + assert "".join(chunks) == output + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS) async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, - audio_url: str): + audio_url: str, + base64_encoded_audio: Dict[str, str]): messages = [{ "role": @@ -229,9 +345,10 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, } }, { - "type": "audio_url", - "audio_url": { - "url": audio_url + "type": "input_audio", + "input_audio": { + "data": base64_encoded_audio[audio_url], + "format": "wav" } }, { diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index aaa5cd759366a..3df08c740d65b 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -13,7 +13,8 @@ # yapf conflicts with isort for this block # yapf: disable from openai.types.chat import (ChatCompletionAssistantMessageParam, - ChatCompletionContentPartImageParam) + ChatCompletionContentPartImageParam, + ChatCompletionContentPartInputAudioParam) from openai.types.chat import ( ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam) from openai.types.chat import (ChatCompletionContentPartRefusalParam, @@ -105,6 +106,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, + ChatCompletionContentPartInputAudioParam, ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam, CustomChatCompletionContentSimpleImageParam, CustomChatCompletionContentSimpleAudioParam, @@ -519,6 +521,10 @@ def parse_image(self, image_url: str) -> None: def parse_audio(self, audio_url: str) -> None: raise NotImplementedError + @abstractmethod + def parse_input_audio(self, input_audio: Dict[str, str]) -> None: + raise NotImplementedError + @abstractmethod def parse_video(self, video_url: str) -> None: raise NotImplementedError @@ -545,6 +551,15 @@ def parse_audio(self, audio_url: str) -> None: placeholder = self._tracker.add("audio", audio) self._add_placeholder(placeholder) + def parse_input_audio(self, input_audio: Dict[str, str]) -> None: + input_audio_data = input_audio.get("data","") + input_audio_format = input_audio.get("format","") + audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}" + audio = get_and_parse_audio(audio_url) + + placeholder = self._tracker.add("audio", audio) + self._add_placeholder(placeholder) + def parse_video(self, video_url: str) -> None: video = 
get_and_parse_video(video_url) @@ -574,6 +589,15 @@ def parse_audio(self, audio_url: str) -> None: placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder(placeholder) + def parse_input_audio(self, input_audio: Dict[str, str]) -> None: + input_audio_data = input_audio.get("data","") + input_audio_format = input_audio.get("format","") + audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}" + audio_coro = async_get_and_parse_audio(audio_url) + + placeholder = self._tracker.add("audio", audio_coro) + self._add_placeholder(placeholder) + def parse_video(self, video_url: str) -> None: video = async_get_and_parse_video(video_url) @@ -667,17 +691,22 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _TextParser = partial(cast, ChatCompletionContentPartTextParam) _ImageParser = partial(cast, ChatCompletionContentPartImageParam) _AudioParser = partial(cast, ChatCompletionContentPartAudioParam) +_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) # Define a mapping from part types to their corresponding parsing functions. -MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = { +MM_PARSER_MAP: Dict[str, + Callable[[ChatCompletionContentPartParam], + Union[str, Dict[str,str]]]] = { "text": lambda part: _TextParser(part).get("text", ""), "image_url": lambda part: _ImageParser(part).get("image_url", {}).get("url", ""), "audio_url": lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""), + "input_audio": + lambda part: _InputAudioParser(part).get("input_audio", {}), "refusal": lambda part: _RefusalParser(part).get("refusal", ""), "video_url": @@ -686,7 +715,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], def _parse_chat_message_content_mm_part( - part: ChatCompletionContentPartParam) -> Tuple[str, str]: + part: ChatCompletionContentPartParam) -> Tuple[str, + Union[str, Dict[str, str]]]: """ Parses a given multi-modal content part based on its type. @@ -717,6 +747,7 @@ def _parse_chat_message_content_mm_part( return part_type, content # Handle missing 'type' but provided direct URL fields. 
+ # 'type' is required field by pydantic if part_type is None: if part.get("image_url") is not None: image_params = cast(CustomChatCompletionContentSimpleImageParam, @@ -726,6 +757,9 @@ def _parse_chat_message_content_mm_part( audio_params = cast(CustomChatCompletionContentSimpleAudioParam, part) return "audio_url", audio_params.get("audio_url", "") + if part.get("input_audio") is not None: + input_audio_params = cast(Dict[str, str], part) + return "input_audio", input_audio_params if part.get("video_url") is not None: video_params = cast(CustomChatCompletionContentSimpleVideoParam, part) @@ -739,7 +773,7 @@ def _parse_chat_message_content_mm_part( VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url", - "audio_url", "video_url") + "audio_url", "input_audio", "video_url") def _parse_chat_message_content_parts( @@ -795,7 +829,7 @@ def _parse_chat_message_content_part( # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) - # if part_type is text/refusal/image_url/audio_url/video_url but + # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but # content is empty, log a warning and skip if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content: logger.warning( @@ -804,18 +838,30 @@ def _parse_chat_message_content_part( return None if part_type in ("text", "refusal"): - return {'type': 'text', 'text': content} if wrap_dicts else content + str_content = cast(str, content) + if wrap_dicts: + return {'type': 'text', 'text': str_content} + else: + return str_content if part_type == "image_url": - mm_parser.parse_image(content) + str_content = cast(str, content) + mm_parser.parse_image(str_content) return {'type': 'image'} if wrap_dicts else None if part_type == "audio_url": - mm_parser.parse_audio(content) + str_content = cast(str, content) + mm_parser.parse_audio(str_content) + return {'type': 'audio'} if wrap_dicts else None + + if part_type == "input_audio": + dict_content = cast(Dict[str, str], content) + mm_parser.parse_input_audio(dict_content) return {'type': 'audio'} if wrap_dicts else None if part_type == "video_url": - mm_parser.parse_video(content) + str_content = cast(str, content) + mm_parser.parse_video(str_content) return {'type': 'video'} if wrap_dicts else None raise NotImplementedError(f"Unknown part type: {part_type}") @@ -840,7 +886,6 @@ def _parse_chat_message_content( content = [ ChatCompletionContentPartTextParam(type="text", text=content) ] - result = _parse_chat_message_content_parts( role, content, # type: ignore From 59c9b6ebeba79b2d744eec86734a7e13b03dcab7 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 16 Dec 2024 22:10:57 -0800 Subject: [PATCH 52/56] [V1][VLM] Proper memory profiling for image language models (#11210) Signed-off-by: Roger Wang Co-authored-by: ywang96 --- vllm/config.py | 8 ++++ vllm/model_executor/models/pixtral.py | 5 ++ vllm/multimodal/registry.py | 23 +++++++-- vllm/v1/core/scheduler.py | 7 ++- vllm/v1/engine/mm_input_mapper.py | 1 + vllm/v1/worker/gpu_model_runner.py | 67 ++++++++++++++++++++++++--- 6 files changed, 98 insertions(+), 13 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 9cfd08024ea7b..9ecd3e72afa9f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1280,6 +1280,14 @@ class SchedulerConfig: is_multimodal_model: bool = False + # FIXME(woosuk & ywang96): Below are placeholder values. We need to + # calculate the actual values from the configurations. 
+ # Multimodal encoder run compute budget, only used in V1 + max_num_encoder_input_tokens = 16384 + + # Multimodal encoder cache size, only used in V1 + encoder_cache_size = 16384 + # Whether to perform preemption by swapping or # recomputation. If not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 161d6b41bfa5f..f05ea195e043d 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -245,6 +245,11 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: # Do not split, return as tensor of shape [1, fs, hs] return image_embeds.unsqueeze(0) + # If the last split index is the last index in image_tokens, we + # ignore it to avoid empty split tensor + if split_indices[-1] == len(image_tokens): + split_indices = split_indices[:-1] + image_embeds = image_embeds.tensor_split(split_indices.cpu()) return image_embeds diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 03f8814a95356..6cd79d414c978 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -200,6 +200,23 @@ def register_max_image_tokens( """ return self.register_max_multimodal_tokens("image", max_mm_tokens) + def get_max_tokens_per_item_by_modality( + self, + model_config: "ModelConfig", + ) -> Mapping[str, int]: + """ + Get the maximum number of tokens per data item from each modality + for profiling the memory usage of a model. + + Note: + This is currently directly used only in V1. + """ + + return { + key: plugin.get_max_multimodal_tokens(model_config) + for key, plugin in self._plugins.items() + } + def get_max_tokens_by_modality( self, model_config: "ModelConfig", @@ -216,9 +233,9 @@ def get_max_tokens_by_modality( limits_per_plugin = self._limits_by_model[model_config] return { - key: (limits_per_plugin[key] * - plugin.get_max_multimodal_tokens(model_config)) - for key, plugin in self._plugins.items() + key: limits_per_plugin[key] * max_tokens_per_mm_item + for key, max_tokens_per_mm_item in + self.get_max_tokens_per_item_by_modality(model_config).items() } def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index f76364f64033d..178532e477dae 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -73,14 +73,13 @@ def __init__( # NOTE(woosuk): Here, "encoder" includes the vision encoder (and # projector if needed). Currently, we assume that the encoder also # has the Transformer architecture (e.g., ViT). - # FIXME(woosuk): Below are placeholder values. We need to calculate the - # actual values from the configurations. - self.max_num_encoder_input_tokens = 16384 + self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens #noqa: E501 # NOTE(woosuk): For the models without encoder (e.g., text-only models), # the encoder cache will not be initialized and used, regardless of # the cache size. This is because the memory space for the encoder cache # is preallocated in the profiling run. 
- self.encoder_cache_manager = EncoderCacheManager(cache_size=16384) + self.encoder_cache_manager = EncoderCacheManager( + cache_size=self.scheduler_config.encoder_cache_size) def schedule(self) -> "SchedulerOutput": # NOTE(woosuk) on the scheduling algorithm: diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py index cca27c2218af7..6cdeba6f3f71e 100644 --- a/vllm/v1/engine/mm_input_mapper.py +++ b/vllm/v1/engine/mm_input_mapper.py @@ -54,6 +54,7 @@ def cache_hit_ratio(self, steps): logger.debug("MMInputMapper: cache_hit_ratio = %.2f ", self.mm_cache_hits / self.mm_cache_total) + # TODO: Support modalities beyond image. def process_inputs( self, mm_data: MultiModalDataDict, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 67166fb05085c..c6fab5f05fcb3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -10,15 +10,16 @@ from vllm.config import CompilationLevel, VllmConfig from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry +from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model -from vllm.multimodal import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.sampling_params import SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, LayerBlockType, cdiv, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) +from vllm.v1.engine.mm_input_mapper import MMInputMapperClient from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -35,7 +36,6 @@ def __init__( self, vllm_config: VllmConfig, device: torch.device, - input_registry: InputRegistry = INPUT_REGISTRY, ): self.vllm_config = vllm_config self.model_config = vllm_config.model_config @@ -77,7 +77,12 @@ def __init__( self.hidden_size = model_config.get_hidden_size() # Multi-modal data support - self.input_registry = input_registry + self.input_registry = INPUT_REGISTRY + self.mm_registry = MULTIMODAL_REGISTRY + # NOTE: mm_input_mapper is only used for memory profiling. + self.mm_input_mapper = MMInputMapperClient(self.model_config) + self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens # noqa: E501 + self.encoder_cache_size = self.scheduler_config.encoder_cache_size # Lazy initialization # self.model: nn.Module # Set after load_model @@ -599,8 +604,6 @@ def _dummy_run( return hidden_states def profile_run(self) -> None: - # TODO(woosuk): Profile the max memory usage of the encoder and - # the encoder cache. # use an empty tensor instead of `None`` to force Dynamo to pass # it by reference, rather by specializing on the value `None`. # the `dtype` argument does not matter, and we use `float32` as @@ -612,6 +615,57 @@ def profile_run(self) -> None: torch.tensor([], dtype=torch.float32, device=self.device) for _ in range(self.num_attn_layers) ] + + # Profile with multimodal encoder & encoder cache. + # TODO (ywang96): generalize this beyond image modality since + # mm_input_mapper only supports image inputs. + if self.is_multimodal_model: + + # Create dummy batch of multimodal inputs. 
+ dummy_request_data = self.input_registry.dummy_data_for_profiling( + model_config=self.model_config, + seq_len=self.max_num_tokens, + mm_registry=self.mm_registry, + ) + dummy_mm_data = dummy_request_data.multi_modal_data + dummy_mm_kwargs, _ = self.mm_input_mapper.process_inputs( + mm_data=dummy_mm_data, + mm_hashes=None, + mm_processor_kwargs=None, + precomputed_mm_inputs=None) + + # NOTE: Currently model is profiled with a single non-text + # modality even when it supports multiple. + max_tokens_per_mm_item = max( + self.mm_registry.get_max_tokens_per_item_by_modality( + self.model_config).values()) + + max_num_mm_items = min( + self.max_num_encoder_input_tokens, + self.encoder_cache_size) // max_tokens_per_mm_item + + # Dummy data definition in V0 may contain multiple multimodal items + # (e.g, multiple images) for a single request, therefore here we + # always replicate first item by max_num_mm_items times since in V1 + # they are scheduled to be processed separately. + batched_dummy_mm_inputs = MultiModalKwargs.batch( + [dummy_mm_kwargs[0]] * max_num_mm_items) + batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( + batched_dummy_mm_inputs, device=self.device) + + # Run multimodal encoder. + dummy_encoder_outputs = self.model.get_multimodal_embeddings( + **batched_dummy_mm_inputs) + assert len(dummy_encoder_outputs) == max_num_mm_items, ( + "Expected dimension 0 of encoder outputs to match the number " + f"of multimodal data items: {max_num_mm_items}, got " + f"{len(dummy_encoder_outputs)=} instead. This is most likely " + "due to the 'get_multimodal_embeddings' method of the model " + "not implemented correctly.") + + # Cache the dummy encoder outputs. + self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) + # Trigger compilation for general shape. hidden_states = self._dummy_run(self.model, self.max_num_tokens, dummy_kv_caches) @@ -620,6 +674,7 @@ def profile_run(self) -> None: # TODO(woosuk): Consider the memory usage of the sampler. 
torch.cuda.synchronize() del hidden_states, logits + self.encoder_cache.clear() gc.collect() def capture_model(self) -> None: From e88db68cf5712956f36e77c288699592327b15bd Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 17 Dec 2024 14:11:06 +0800 Subject: [PATCH 53/56] [Platform] platform agnostic for EngineArgs initialization (#11225) Signed-off-by: wangxiyuan --- vllm/engine/arg_utils.py | 8 ++------ vllm/platforms/cpu.py | 3 +++ vllm/platforms/cuda.py | 4 ++++ vllm/platforms/hpu.py | 6 ++++++ vllm/platforms/neuron.py | 6 ++++++ vllm/platforms/openvino.py | 3 +++ vllm/platforms/rocm.py | 4 ++++ vllm/platforms/tpu.py | 5 +++++ vllm/platforms/xpu.py | 4 ++++ 9 files changed, 37 insertions(+), 6 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 06b8542779dc0..f6d276fe7c0c8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -112,9 +112,7 @@ class EngineArgs: pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None - # NOTE(kzawora): default block size for Gaudi should be 128 - # smaller sizes still work, but very inefficiently - block_size: int = 16 if not current_platform.is_hpu() else 128 + block_size: Optional[int] = None enable_prefix_caching: Optional[bool] = None disable_sliding_window: bool = False use_v2_block_manager: bool = True @@ -1036,9 +1034,7 @@ def create_engine_config(self, self.enable_prefix_caching = False cache_config = CacheConfig( - # neuron needs block_size = max_model_len - block_size=self.block_size if self.device != "neuron" else - (self.max_model_len if self.max_model_len is not None else 0), + block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, swap_space=self.swap_space, cache_dtype=self.kv_cache_dtype, diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index aad8755d9fcd8..d95a2b4cd5565 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -60,6 +60,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE if kv_cache_space >= 0: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index ae1fd6d5ce068..3c5350b778345 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -137,6 +137,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: else: parallel_config.worker_cls = "vllm.worker.worker.Worker" + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 2b947d280f9f8..0a44f2b74163a 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -48,6 +48,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.worker_cls == "auto": parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker" + # NOTE(kzawora): default block size for Gaudi should be 128 + # smaller sizes still work, but very inefficiently + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 128 + @classmethod def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on HPU.") diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 86113523385f6..a4bbbd27c8a89 
100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -33,6 +33,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config.worker_cls = \ "vllm.worker.neuron_worker.NeuronWorker" + cache_config = vllm_config.cache_config + if cache_config: + # neuron needs block_size = max_model_len + vllm_config.cache_config.block_size = \ + vllm_config.model_config.max_model_len + @classmethod def is_pin_memory_available(cls) -> bool: logger.warning("Pin memory is not supported on Neuron.") diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index ccd94e8adb3b1..16eb8dc81efc2 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -87,6 +87,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: # check and update cache config ov_core = ov.Core() cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8": if not OpenVinoPlatform.is_openvino_cpu(): logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 0133f26a0b1bc..7778b565372cb 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -84,6 +84,10 @@ def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 10d874349f36b..77f5c8401424b 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -46,6 +46,11 @@ def inference_mode(cls): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: from vllm.config import CompilationLevel + + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + compilation_config = vllm_config.compilation_config if compilation_config.level == CompilationLevel.NO_COMPILATION: # TPU does not support NO_COMPILATION diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index c20190e789d7e..78e17c2afec65 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -51,6 +51,10 @@ def inference_mode(): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + # check and update model config model_config = vllm_config.model_config if model_config.dtype == torch.bfloat16: From 2bfdbf2a36256bb08547cea3d4ef83b5d27c4b04 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 17 Dec 2024 01:11:33 -0500 Subject: [PATCH 54/56] [V1][Core] Use weakref.finalize instead of atexit (#11242) Signed-off-by: Tyler Michael Smith --- vllm/v1/engine/core_client.py | 13 ++----------- vllm/v1/executor/multiproc_executor.py | 10 +++------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ff25a9b2e9cac..d56fcbdb1e7c4 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,5 @@ -import atexit import os +import weakref from typing import List, Optional import 
msgspec @@ -165,15 +165,9 @@ def __init__( ready_path=ready_path, # type: ignore[misc] **kwargs, ) - atexit.register(self.shutdown) + self._finalizer = weakref.finalize(self, self.shutdown) def shutdown(self): - # During final garbage collection in process shutdown, atexit may be - # None. - if atexit: - # in case shutdown gets called via __del__ first - atexit.unregister(self.shutdown) - # Shut down the zmq context. self.ctx.destroy(linger=0) @@ -197,9 +191,6 @@ def shutdown(self): os.remove(socket_file) self.proc_handle = None - def __del__(self): - self.shutdown() - class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 17441dacdc5cf..128101aa6956d 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,9 +1,9 @@ -import atexit import os import pickle import signal import sys import time +import weakref from dataclasses import dataclass from enum import Enum, auto from multiprocessing.process import BaseProcess @@ -37,7 +37,7 @@ class MultiprocExecutor(Executor): def __init__(self, vllm_config: VllmConfig) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. - atexit.register(self.shutdown) + self._finalizer = weakref.finalize(self, self.shutdown) self.vllm_config = vllm_config self.parallel_config = vllm_config.parallel_config @@ -195,14 +195,10 @@ def _cleanup_sockets(self): os.remove(socket_path) def shutdown(self): - if atexit: - # in case shutdown was called explicitly, we don't need to call it - # again - atexit.unregister(self.shutdown) """Properly shut down the executor and its workers""" if getattr(self, 'shutting_down', False): self.shutting_down = True - for w in self.workers: #TODO: not sure if needed + for w in self.workers: w.worker_response_mq = None self._ensure_worker_termination() From 02222a0256f60319f5bcd56d1d036a943d6334f8 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 16 Dec 2024 22:57:02 -0800 Subject: [PATCH 55/56] [Misc] Kernel Benchmark for `RMSNorm` (#11241) Signed-off-by: Roger Wang Co-authored-by: Xiaoyu Zhang --- benchmarks/kernels/benchmark_rmsnorm.py | 262 ++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 benchmarks/kernels/benchmark_rmsnorm.py diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py new file mode 100644 index 0000000000000..baa5de0fff1bd --- /dev/null +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -0,0 +1,262 @@ +import itertools +from typing import Optional, Tuple, Union + +import torch +import triton +from flashinfer.norm import fused_add_rmsnorm, rmsnorm +from torch import nn + +from vllm import _custom_ops as vllm_ops + + +class HuggingFaceRMSNorm(nn.Module): + + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + orig_dtype = x.dtype + x = x.to(torch.float32) + if residual is not None: + x = x + residual.to(torch.float32) + residual = x.to(orig_dtype) + + variance = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + x = x.to(orig_dtype) * self.weight + if residual is None: + return x + else: + return 
x, residual + + +def rmsnorm_naive( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) + naive_norm.weight = nn.Parameter(weight) + naive_norm = naive_norm.to(x.device) + + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + output = naive_norm(x, residual) + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def rmsnorm_flashinfer( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + if residual is not None: + fused_add_rmsnorm(x, residual, weight, eps) + output = (x, residual) + else: + output = rmsnorm(x, weight, eps) + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def rmsnorm_vllm( + x: torch.Tensor, + weight: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + if residual is not None: + vllm_ops.fused_add_rms_norm(x, residual, weight, eps) + output = (x, residual) + else: + out = torch.empty_like(x) + vllm_ops.rms_norm(out, x, weight, eps) + output = out + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True): + dtype = torch.bfloat16 + x = torch.randn(batch_size, + seq_len, + hidden_size, + dtype=dtype, + device="cuda") + weight = torch.ones(hidden_size, dtype=dtype, device="cuda") + residual = torch.randn_like(x) if use_residual else None + + output_naive = rmsnorm_naive( + x.clone(), weight, + residual.clone() if residual is not None else None) + output_flashinfer = rmsnorm_flashinfer( + x.clone(), weight, + residual.clone() if residual is not None else None) + output_vllm = rmsnorm_vllm( + x.clone(), weight, + residual.clone() if residual is not None else None) + + if use_residual: + output_naive = output_naive[0] + output_flashinfer = output_flashinfer[0] + output_vllm = output_vllm[0] + + print(f"Naive output={output_naive}") + print(f"FlashInfer output={output_flashinfer}") + print(f"VLLM output={output_vllm}") + + if torch.allclose(output_naive, output_flashinfer, atol=1e-2, + rtol=1e-2) and torch.allclose( + output_naive, output_vllm, atol=1e-2, rtol=1e-2): + print("✅ All implementations match") + else: + print("❌ Implementations differ") + + +batch_size_range = [2**i for i in range(0, 7, 2)] +seq_length_range = [2**i for i in range(6, 11, 1)] +head_num_range = [32, 48] +configs = list( + itertools.product(head_num_range, batch_size_range, seq_length_range)) + + +def get_benchmark(use_residual): + + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["head_num", "batch_size", "seq_len"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["huggingface", "flashinfer", "vllm"], + line_names=["HuggingFace", "FlashInfer", "vLLM"], + styles=[("blue", "-"), ("green", "-"), ("red", "-")], + 
ylabel="us", + plot_name= + f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual", + args={}, + )) + def benchmark(head_num, batch_size, seq_len, provider): + dtype = torch.bfloat16 + hidden_size = head_num * 128 # assuming head_dim = 128 + + x = torch.randn(batch_size, + seq_len, + hidden_size, + dtype=dtype, + device="cuda") + weight = torch.ones(hidden_size, dtype=dtype, device="cuda") + residual = torch.randn_like(x) if use_residual else None + + quantiles = [0.5, 0.2, 0.8] + + if provider == "huggingface": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_naive( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + elif provider == "flashinfer": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_flashinfer( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rmsnorm_vllm( + x.clone(), + weight, + residual.clone() if residual is not None else None, + ), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms + + return benchmark + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--batch-size", + type=int, + default=4, + help="Batch size", + ) + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length", + ) + parser.add_argument( + "--hidden-size", + type=int, + default=4096, + help="Hidden size (2nd dimension) of the sequence", + ) + parser.add_argument("--use-residual", + action="store_true", + help="Whether to use residual connection") + parser.add_argument( + "--save-path", + type=str, + default="./configs/rmsnorm/", + help="Path to save rmsnorm benchmark results", + ) + + args = parser.parse_args() + + # Run correctness test + calculate_diff(batch_size=args.batch_size, + seq_len=args.seq_len, + hidden_size=args.hidden_size, + use_residual=args.use_residual) + + # Get the benchmark function with proper use_residual setting + benchmark = get_benchmark(args.use_residual) + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) From f9ecbb18bf03338a4272c933a49a87021363b048 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 17 Dec 2024 16:37:04 +0800 Subject: [PATCH 56/56] [Misc] Allow passing logits_soft_cap for xformers backend (#11252) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/attention/backends/xformers.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index e2e989efb020c..3e59b3603d2c6 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -17,9 +17,7 @@ is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set) from vllm.attention.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) -from vllm.logger import init_logger - -logger = init_logger(__name__) +from vllm.utils import print_warning_once class XFormersBackend(AttentionBackend): @@ -386,8 +384,8 @@ def __init__( raise ValueError( "XFormers does not support block-sparse attention.") if logits_soft_cap is not None: - raise ValueError( - "XFormers does not support attention logits soft capping.") + print_warning_once("XFormers does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale)
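For context on the XFormers change above: "logits soft cap" bounds the pre-softmax attention scores with a tanh, so a backend that skips the capping (as the new warning states) produces outputs that are only slightly off whenever the raw scores stay well below the cap. A minimal sketch of the standard tanh-based formulation (the function name and example values here are illustrative assumptions, not part of the patch):

```python
import torch


def soft_cap_logits(scores: torch.Tensor, cap: float) -> torch.Tensor:
    """Smoothly bound attention scores to the open interval (-cap, cap)."""
    return cap * torch.tanh(scores / cap)


if __name__ == "__main__":
    scores = torch.randn(4, 8) * 5.0  # raw attention logits (illustrative)
    cap = 30.0                        # an assumed soft-cap value
    capped = soft_cap_logits(scores, cap)
    # For |scores| << cap, tanh(x / cap) ~= x / cap, so capping is close to
    # the identity -- which is why skipping it merely makes outputs
    # "slightly off" rather than incorrect.
    print((capped - scores).abs().max())
```

The smaller the typical score magnitude is relative to the cap, the closer the capped and uncapped scores are, so models whose scores stay far below the cap are the least affected by this fallback.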