Gradlib torch extension cmake (#282)
* Converted gradlib into a CMake project while using a TORCH_LIBRARY binding rather than pybind11

* Made gradlib a vllm _gradlib_C module

* Reusing binding includes from core vllm

* The extension is created by the wrapper

* Removed gradlib mentions from the Dockerfile
gshtras authored Nov 15, 2024
1 parent 5362727 commit 48726bf
Showing 17 changed files with 363 additions and 1,163 deletions.
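The heart of the change is the move from a pybind11 module to dispatcher-registered ops. The new csrc/gradlib/torch_bindings.cpp is collapsed in this view, so the block below is only a hedged sketch of what such a TORCH_LIBRARY registration could look like: the op schemas mirror the declarations added in csrc/gradlib/ops.h further down, the defaults mirror the pybind11 bindings removed from hipbsolgemm.cu, and the `_gradlib_C` namespace plus the use of a plain TORCH_LIBRARY block (instead of vllm's shared binding macros) are assumptions, not the file's actual contents.

```cpp
// Hypothetical sketch of csrc/gradlib/torch_bindings.cpp (the real file is
// collapsed in this diff view). The actual file reuses core vllm's binding
// includes, so it may go through helper macros rather than a bare
// TORCH_LIBRARY block.
#include <torch/library.h>

#include "ops.h"

TORCH_LIBRARY(_gradlib_C, m) {
  // hipBLASLt path
  m.def("hipb_create_extension", &hipb_create_extension);
  m.def("hipb_destroy_extension", &hipb_destroy_extension);
  m.def(
      "hipb_mm(Tensor mat1, Tensor mat2, int solution_index, Tensor? bias=None,"
      " ScalarType? out_dtype=None, Tensor? scale1=None, Tensor? scale2=None,"
      " Tensor? scaleOut=None) -> Tensor");
  m.impl("hipb_mm", torch::kCUDA, &hipb_mm);
  m.def(
      "hipb_findallsols(Tensor mat1, Tensor mat2, Tensor? bias=None,"
      " ScalarType? out_dtype=None) -> int[]");
  m.impl("hipb_findallsols", torch::kCUDA, &hipb_findallsols);

  // rocBLAS path
  m.def("rocb_create_extension", &rocb_create_extension);
  m.def("rocb_destroy_extension", &rocb_destroy_extension);
  m.def("rocb_mm(Tensor mat1, Tensor mat2, int solution_index) -> Tensor");
  m.impl("rocb_mm", torch::kCUDA, &RocSolIdxBlas);
  m.def("rocb_findallsols(Tensor mat1, Tensor mat2) -> int[]");
  m.impl("rocb_findallsols", torch::kCUDA, &RocFindAllSolIdxBlas);
}
```

Registering schemas with `Tensor?`/`ScalarType?` arguments is also why the C++ signatures in the diff below drop py::object and plain int in favor of c10::ScalarType and int64_t, which are the types the dispatcher understands.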
14 changes: 0 additions & 14 deletions .github/workflows/publish.yml
@@ -68,12 +68,8 @@ jobs:
bash -x .github/workflows/scripts/build.sh
wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
gradlib_wheel_name=$(find gradlib/dist -name "*whl" -print0 | xargs -0 -n 1 basename)
gradlib_asset_name=${gradlib_wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
echo "gradlib_wheel_name=${gradlib_wheel_name}" >> "$GITHUB_ENV"
echo "gradlib_asset_name=${gradlib_asset_name}" >> "$GITHUB_ENV"
- name: Upload vllm Release Asset
uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
@@ -84,13 +80,3 @@ jobs:
asset_path: ./dist/${{ env.wheel_name }}
asset_name: ${{ env.asset_name }}
asset_content_type: application/*
- name: Upload gradlib Release Asset
uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./gradlib/dist/${{ env.gradlib_wheel_name }}
asset_name: ${{ env.gradlib_asset_name }}
asset_content_type: application/*

3 changes: 0 additions & 3 deletions .github/workflows/scripts/build.sh
@@ -18,6 +18,3 @@ export MAX_JOBS=32

# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
cd gradlib
$python_executable setup.py bdist_wheel --dist-dir=dist
cd ..
18 changes: 18 additions & 0 deletions CMakeLists.txt
@@ -508,6 +508,24 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)

#
# _gradlib_C extension
#
set(VLLM_GRADLIB_EXT_SRC
"csrc/gradlib/torch_bindings.cpp"
"csrc/gradlib/hipbsolgemm.cu"
"csrc/gradlib/rocsolgemm.cu")

define_gpu_extension_target(
_gradlib_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_GRADLIB_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
endif()

# vllm-flash-attn currently only supported on CUDA
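The define_gpu_extension_target call above builds _gradlib_C as a Python extension module installed under the vllm package (WITH_SOABI), so `import vllm._gradlib_C` still needs a module-init entry point even though no Python-level functions are exposed anymore. vllm's shared binding includes provide that boilerplate via a small macro; the sketch below is only a hand-rolled, illustrative equivalent, not the macro's actual expansion.

```cpp
// Hypothetical equivalent of the module-init boilerplate supplied by vllm's
// shared binding includes. All ops are registered via TORCH_LIBRARY when the
// shared object is loaded, so the Python module itself can stay empty -- it
// only has to be importable as vllm._gradlib_C.
#include <Python.h>

extern "C" PyObject* PyInit__gradlib_C(void) {
  static struct PyModuleDef gradlib_module = {
      PyModuleDef_HEAD_INIT,
      "_gradlib_C",  // module name as seen by "import vllm._gradlib_C"
      nullptr,       // no docstring
      0,             // no per-module state
      nullptr};      // no Python-level methods; the ops live in the dispatcher
  return PyModule_Create(&gradlib_module);
}
```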
12 changes: 4 additions & 8 deletions Dockerfile.rocm
@@ -148,7 +148,7 @@ FROM scratch AS export_flash_attn_0
FROM export_flash_attn_${BUILD_FA} AS export_flash_attn

# -----------------------
# vLLM (and gradlib) fetch stages
# vLLM fetch stages
FROM base AS fetch_vllm_0
ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1
@@ -160,7 +160,7 @@ ONBUILD RUN git clone ${VLLM_REPO} \
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm

# -----------------------
# vLLM (and gradlib) build stages
# vLLM build stages
FROM fetch_vllm AS build_vllm
ARG COMMON_WORKDIR
ARG USE_CYTHON
@@ -184,13 +184,9 @@ RUN cd vllm \
&& python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
# Build gradlib
RUN cd vllm/gradlib \
&& python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/gradlib/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/rocm_patch /rocm_patch
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
@@ -265,7 +261,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \
&& make && make install \
&& cd hipMarker && python setup.py install ; fi

# Install vLLM (and gradlib)
# Install vLLM
# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
@@ -277,7 +273,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
*"rocm-6.1"*) \
cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6;; \
*) ;; esac \
&& pip uninstall -y vllm gradlib \
&& pip uninstall -y vllm \
&& pip install *.whl

# Copy over the benchmark scripts as well
45 changes: 12 additions & 33 deletions gradlib/csrc/hipbsolgemm.cu → csrc/gradlib/hipbsolgemm.cu
@@ -6,7 +6,6 @@
// __HIP_NO_HALF_CONVERSIONS__ #endif

#include <torch/torch.h>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/autocast_mode.h>
#include <ATen/cuda/CUDABlas.h>
@@ -119,7 +118,7 @@ std::map<at::ScalarType, hipDataType> dtype_map{
} // namespace

// find all hipblaslt solutions for given gemm problem
std::vector<int> hipblasLtMatmul_findallsols_wrapper(
std::vector<int64_t> hipblasLtMatmul_findallsols_wrapper(
hipblasLtHandle_t handle, hipblasOperation_t op_A, hipblasOperation_t op_B,
int m, int n, int k, const void* alpha, const void* a, int lda,
const void* b, int ldb, const void* beta, void* c, int ldc,
@@ -163,7 +162,7 @@ std::vector<int> hipblasLtMatmul_findallsols_wrapper(
handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, op_A, op_B, intype,
intype, outtype, outtype, HIPBLAS_COMPUTE_32F, heuristicResult));

std::vector<int> algoIndex;
std::vector<int64_t> algoIndex;
int returned_algo_count = heuristicResult.size();
// for (int i = 0; i < returnedAlgoCount; i++) {
for (int i = 0; i < returned_algo_count; i++) {
@@ -290,12 +289,12 @@ hipblasStatus_t hipblasLtMatmul_sol_wrapper(
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
const int solution_index,
at::optional<torch::Tensor> bias = at::nullopt,
at::optional<py::object> out_dtype = at::nullopt,
at::optional<torch::Tensor> scale1 = at::nullopt,
at::optional<torch::Tensor> scale2 = at::nullopt,
at::optional<torch::Tensor> scaleOut = at::nullopt) {
const int64_t solution_index,
at::optional<torch::Tensor> bias,
at::optional<c10::ScalarType> out_dtype,
at::optional<torch::Tensor> scale1,
at::optional<torch::Tensor> scale2,
at::optional<torch::Tensor> scaleOut) {
auto mat1_strides{mat1.strides()};
auto mat2_strides{mat2.strides()};
auto mat1_sizes{mat1.sizes()};
@@ -309,10 +308,7 @@ torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
"mat1 dim 1 must match mat2 dim 0");

auto inDtype{mat1.options().dtype().toScalarType()};
auto outDtype{
out_dtype.has_value()
? torch::python::detail::py_object_to_dtype(out_dtype.value())
: inDtype};
auto outDtype{out_dtype.has_value() ? out_dtype.value() : inDtype};
auto options{at::TensorOptions().dtype(outDtype).device(at::kCUDA)};
auto result{torch::empty({mat1_sizes[0], mat2_sizes[1]}, options)};

@@ -392,10 +388,10 @@ torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
}

// find all hipblas solutions and return them to python land
std::vector<int> hipb_findallsols(
std::vector<int64_t> hipb_findallsols(
const torch::Tensor& mat1, const torch::Tensor& mat2,
at::optional<torch::Tensor> bias = at::nullopt,
at::optional<py::object> out_dtype = at::nullopt) {
at::optional<c10::ScalarType> out_dtype = at::nullopt) {
auto mat1_strides{mat1.strides()};
auto mat2_strides{mat2.strides()};
auto mat1_sizes{mat1.sizes()};
@@ -408,10 +404,7 @@ std::vector<int> hipb_findallsols(
"mat1 dim 1 must match mat2 dim 0");

auto inType{mat1.options().dtype().toScalarType()};
auto outType{
out_dtype.has_value()
? torch::python::detail::py_object_to_dtype(out_dtype.value())
: inType};
auto outType{out_dtype.has_value() ? out_dtype.value() : inType};

auto options{at::TensorOptions().dtype(outType).device(at::kCUDA)};
auto result{torch::empty({mat1_sizes[0], mat2_sizes[1]}, options)};
@@ -504,17 +497,3 @@ void hipb_destroy_extension() {
// CHECK_HIP_ERROR(hipEventDestroy(start));
// CHECK_HIP_ERROR(hipEventDestroy(stop));
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("hipb_create_extension", &hipb_create_extension, "create_extension");
m.def("hipb_destroy_extension", &hipb_destroy_extension, "destroy_extension");
m.def("hipb_mm", &hipb_mm, "hipb_mm", py::arg("mat1"), py::arg("mat2"),
py::arg("solution_index"), py::arg("bias") = at::nullopt,
py::arg("out_dtype") = at::nullopt, py::arg("scale1") = at::nullopt,
py::arg("scale2") = at::nullopt, py::arg("scaleOut") = at::nullopt);
m.def("hipb_findallsols", &hipb_findallsols, "hipb_findallsols",
py::arg("mat1"), py::arg("mat2"), py::arg("bias") = at::nullopt,
py::arg("out_dtype") = at::nullopt);
}
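With the PYBIND11_MODULE block gone, these functions are no longer exported as a standalone Python extension; they are reached through the PyTorch dispatcher instead (from Python, e.g. via torch.ops). A hedged C++ sketch of the equivalent dispatcher lookup is shown below; the `_gradlib_C::hipb_mm` name assumes the namespace used in the torch_bindings.cpp sketch near the top of this page.

```cpp
// Hedged sketch: calling the dispatcher-registered op from C++ once the
// extension library has been loaded. The op name and schema are assumptions
// based on the sketch above, not taken from the (collapsed) bindings file.
#include <ATen/core/dispatch/Dispatcher.h>
#include <torch/all.h>

torch::Tensor hipb_mm_via_dispatcher(const torch::Tensor& mat1,
                                     const torch::Tensor& mat2,
                                     int64_t solution_index) {
  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow("_gradlib_C::hipb_mm", "")
          .typed<torch::Tensor(
              const torch::Tensor&, const torch::Tensor&, int64_t,
              at::optional<torch::Tensor>, at::optional<c10::ScalarType>,
              at::optional<torch::Tensor>, at::optional<torch::Tensor>,
              at::optional<torch::Tensor>)>();
  // No bias or scales and the default output dtype -- the defaults the old
  // pybind11 binding expressed with py::arg(...) = at::nullopt.
  return op.call(mat1, mat2, solution_index, at::nullopt, at::nullopt,
                 at::nullopt, at::nullopt, at::nullopt);
}
```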
27 changes: 27 additions & 0 deletions csrc/gradlib/ops.h
@@ -0,0 +1,27 @@
#pragma once

#include <torch/all.h>

void hipb_create_extension();
void hipb_destroy_extension();
torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
const int64_t solution_index,
at::optional<torch::Tensor> bias = at::nullopt,
at::optional<c10::ScalarType> out_dtype = at::nullopt,
at::optional<torch::Tensor> scale1 = at::nullopt,
at::optional<torch::Tensor> scale2 = at::nullopt,
at::optional<torch::Tensor> scaleOut = at::nullopt);

std::vector<int64_t> hipb_findallsols(const torch::Tensor& mat1,
const torch::Tensor& mat2,
at::optional<torch::Tensor> bias,
at::optional<c10::ScalarType> out_dtype);

void rocb_create_extension();
void rocb_destroy_extension();
torch::Tensor RocSolIdxBlas(const torch::Tensor& mat1,
const torch::Tensor& mat2,
const int64_t solution_index);

std::vector<int64_t> RocFindAllSolIdxBlas(const torch::Tensor& mat1,
const torch::Tensor& mat2);
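As a usage illustration only (this is not code from the commit), the tuning flow these declarations support is: enumerate candidate GEMM solutions for a shape, then replay one by index. A minimal hedged C++ sketch against the ops.h API above:

```cpp
// Hedged usage sketch of the API declared in csrc/gradlib/ops.h: enumerate
// hipBLASLt solutions for a GEMM problem, then run the GEMM with one of them.
#include <vector>

#include <torch/all.h>

#include "ops.h"  // csrc/gradlib/ops.h

void gradlib_tuning_example() {
  hipb_create_extension();

  auto opts = torch::dtype(torch::kFloat16).device(torch::kCUDA);
  auto a = torch::randn({128, 256}, opts);
  auto b = torch::randn({256, 512}, opts);  // inner dims must match

  // All viable hipBLASLt solution indices for this problem.
  std::vector<int64_t> sols =
      hipb_findallsols(a, b, /*bias=*/at::nullopt, /*out_dtype=*/at::nullopt);

  if (!sols.empty()) {
    // Replay the first solution; a real tuner would time each index.
    torch::Tensor c = hipb_mm(a, b, sols.front());
  }

  hipb_destroy_extension();
}
```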

