diff --git a/lmdeploy/pytorch/check_env/__init__.py b/lmdeploy/pytorch/check_env/__init__.py
index 2b4b3cc52..291b1afb3 100644
--- a/lmdeploy/pytorch/check_env/__init__.py
+++ b/lmdeploy/pytorch/check_env/__init__.py
@@ -66,6 +66,10 @@ def check_env_triton(device: str):
     from packaging import version
     logger = get_logger('lmdeploy')
 
+    msg = (
+        'Please ensure that your device is functioning properly with <Triton>.\n'  # noqa: E501
+        'You can verify your environment by running '
+        '`python -m lmdeploy.pytorch.check_env.triton_custom_add`.')
     try:
         logger.debug('Checking <Triton> environment.')
         import torch
@@ -87,11 +91,9 @@ def check_env_triton(device: str):
                 'This Error might caused by mismatching between NVIDIA Driver and nvcc compiler. \n'  # noqa: E501
                 'Try solution https://github.com/triton-lang/triton/issues/1955#issuecomment-1929908209'  # noqa: E501
                 ' or reinstall the driver.')
-        else:
-            msg = None
         _handle_exception(e, 'Triton', logger, msg)
     except Exception as e:
-        _handle_exception(e, 'Triton', logger)
+        _handle_exception(e, 'Triton', logger, msg)
 
     if device == 'cuda':
         device_cap = torch.cuda.get_device_capability()
diff --git a/lmdeploy/pytorch/check_env/triton_custom_add.py b/lmdeploy/pytorch/check_env/triton_custom_add.py
index ef77fb810..077359110 100644
--- a/lmdeploy/pytorch/check_env/triton_custom_add.py
+++ b/lmdeploy/pytorch/check_env/triton_custom_add.py
@@ -23,3 +23,11 @@ def custom_add(a, b):
     grid = (triton.cdiv(size, BLOCK), )
     _add_kernel[grid](a, b, c, size, BLOCK=BLOCK)
     return c
+
+
+if __name__ == '__main__':
+    a = torch.tensor([1, 2], device='cuda')
+    b = a.new_tensor([3, 4], device='cuda')
+    c = custom_add(a, b)
+    torch.testing.assert_close(c, a + b)
+    print('Done.')
diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py
index 7790a44b1..e15ab911f 100644
--- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py
+++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py
@@ -1153,9 +1153,9 @@ def _get_block_d(Lk):
     if not is_decoding:
         BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV = _get_block_d(Lq)
         if _nv_cap[0] < 8:
-            BLOCK_M = max(16, min(BLOCK, 8192 // BLOCK_DMODEL))
+            BLOCK_M = max(16, 8192 // BLOCK_DMODEL)
         else:
-            BLOCK_M = max(16, min(BLOCK, 16384 // BLOCK_DMODEL))
+            BLOCK_M = max(16, 16384 // BLOCK_DMODEL)
         num_warps = 4
         num_stages = 2
         kv_head = k.shape[h_dim]