[PyTorch] Implement Fp8 padding and unpadding module (#1129)
* [TE/PyTorch][MoE] Add FP8 padding and unpadding module

  1. Add a multi-tensor padding kernel for FP8 with padding size = 16.
  2. Add Fp8Padding and Fp8Unpadding modules.
  3. Add padded GroupedLinear unit tests.

Signed-off-by: beinggod <[email protected]>
Co-authored-by: Phuong Nguyen <[email protected]>
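For context, a minimal sketch of how the two new modules might wrap a grouped GEMM in an MoE layer. The constructor argument, the `m_splits` list, and the returned `padded_m_splits` are assumptions inferred from the commit description, not a confirmed API:

    import torch
    from transformer_engine.pytorch import Fp8Padding, Fp8Unpadding

    num_gemms = 4                    # one GEMM per expert (hypothetical sizes)
    m_splits = [43, 256, 1, 768]     # per-expert row counts, not 16-aligned

    pad = Fp8Padding(num_gemms)      # assumed constructor signature
    unpad = Fp8Unpadding(num_gemms)  # assumed constructor signature

    inp = torch.randn(sum(m_splits), 1024, dtype=torch.bfloat16, device="cuda")

    # Pad each split's rows up to a multiple of 16 so FP8 GEMMs see aligned shapes.
    padded, padded_m_splits = pad(inp, m_splits)

    # ... run GroupedLinear (or other expert GEMMs) on `padded` with `padded_m_splits` ...

    # Strip the zero rows again to recover the original per-expert row counts.
    out = unpad(padded, m_splits)
    assert out.shape == inp.shape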
Showing 16 changed files with 995 additions and 1 deletion.
@@ -0,0 +1,169 @@
/*************************************************************************
 * Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * See LICENSE for license information.
 ************************************************************************/

#include <cstdio>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <random>
#include <vector>

#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>

#include <transformer_engine/padding.h>
#include "../test_common.h"

using namespace transformer_engine;

namespace {

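// CPU reference: rows below `height` are copied through float; the rows added
// by padding (up to `padded_height`) are zero-filled.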
template <typename InputType, typename OutputType>
void compute_ref(const std::vector<std::vector<InputType>>& input_list,
                 std::vector<std::vector<OutputType>>& output_list,
                 const std::vector<size_t>& height_list,
                 const std::vector<size_t>& width_list,
                 const std::vector<int>& padded_height_list) {
  using compute_t = float;
  for (size_t tensor_id = 0; tensor_id < input_list.size(); ++tensor_id) {
    const auto& input = input_list[tensor_id];
    auto& output = output_list[tensor_id];
    const size_t height = height_list[tensor_id];
    const size_t width = width_list[tensor_id];
    const size_t padded_height = padded_height_list[tensor_id];

    for (size_t i = 0; i < padded_height; ++i) {
      if (i < height) {
        for (size_t j = 0; j < width; ++j) {
          const compute_t x = static_cast<compute_t>(input[i * width + j]);
          const OutputType y = static_cast<OutputType>(x);
          output[i * width + j] = y;
        }
      } else {
        for (size_t j = 0; j < width; ++j) {
          output[i * width + j] = static_cast<OutputType>(0.f);
        }
      }
    }
  }
}

template <typename InputType, typename OutputType>
void performTest() {
  using namespace test;

  const DType itype = TypeInfo<InputType>::dtype;
  const DType otype = TypeInfo<OutputType>::dtype;
  const std::vector<std::pair<size_t, size_t>> tensor_dims = {{1, 1},
                                                              {1, 768},
                                                              {768, 1},
                                                              {768, 768},
                                                              {43, 43},
                                                              {43, 256},
                                                              {256, 43},
                                                              {256, 256}};
  const size_t num_tensors = tensor_dims.size();
  constexpr int align = 16;

  // Buffers for Transformer Engine implementation
  std::vector<Tensor> input_list, output_list, output_t_list;

  // Buffers for reference implementation
  std::vector<std::vector<InputType>> ref_input_list;
  std::vector<std::vector<OutputType>> ref_output_list;
  std::vector<size_t> ref_height_list(num_tensors), ref_width_list(num_tensors);
  std::vector<int> ref_padded_height_list(num_tensors);

  // Initialize buffers
  for (size_t tensor_id = 0; tensor_id < num_tensors; ++tensor_id) {
    const size_t height = tensor_dims[tensor_id].first;
    const size_t width = tensor_dims[tensor_id].second;
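    // Round the height up to the next multiple of align (16), matching the
    // FP8 padding size named in the commit message.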
    const size_t padded_height = (height + align - 1) / align * align;
    input_list.emplace_back(Tensor({height, width}, itype));
    output_list.emplace_back(Tensor({padded_height, width}, otype));

    auto& input = input_list.back();
    auto& output = output_list.back();
    fillUniform(&input);
    setRandomScale(&output);

    ref_input_list.emplace_back(height * width);
    ref_output_list.emplace_back(padded_height * width);

    std::copy(input.cpu_dptr<InputType>(),
              input.cpu_dptr<InputType>() + height * width,
              ref_input_list.back().begin());
    ref_height_list[tensor_id] = height;
    ref_width_list[tensor_id] = width;
    ref_padded_height_list[tensor_id] = padded_height;
  }

  // Transformer Engine implementation
  auto make_nvte_vector = [](std::vector<Tensor>& tensor_list)
      -> std::vector<NVTETensor> {
    std::vector<NVTETensor> nvte_tensor_list;
    for (auto& tensor : tensor_list) {
      nvte_tensor_list.emplace_back(tensor.data());
    }
    return nvte_tensor_list;
  };
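  // Pad all tensors in a single call; the trailing 0 is presumably the
  // default CUDA stream.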
  nvte_multi_padding(num_tensors,
                     make_nvte_vector(input_list).data(),
                     make_nvte_vector(output_list).data(),
                     ref_padded_height_list.data(),
                     0);
  cudaDeviceSynchronize();
  auto err = cudaGetLastError();
  ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);

  // Reference implementation
  compute_ref<InputType, OutputType>(ref_input_list,
                                     ref_output_list,
                                     ref_height_list,
                                     ref_width_list,
                                     ref_padded_height_list);

  // Check correctness
  for (size_t tensor_id = 0; tensor_id < num_tensors; ++tensor_id) {
    auto [atol, rtol] = getTolerances(otype);
    compareResults("output",
                   output_list[tensor_id],
                   ref_output_list[tensor_id].data(),
                   atol, rtol);
  }
}

}  // namespace

class MultiPaddingTestSuite
    : public ::testing::TestWithParam<transformer_engine::DType> {};

TEST_P(MultiPaddingTestSuite, TestMultiPaddingTranspose) {
  using namespace transformer_engine;
  using namespace test;

  const DType input_type = GetParam();
  const DType output_type = input_type;

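  // Instantiate the templated test for the runtime dtype; the output dtype
  // mirrors the input dtype, so each instantiation pads within a single type.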
  TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(input_type, InputType,
    TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(output_type, OutputType,
      performTest<InputType, OutputType>();
    );
  );
}

INSTANTIATE_TEST_SUITE_P(
    OperatorTest,
    MultiPaddingTestSuite,
    ::testing::ValuesIn(test::all_fp_types),
    [](const testing::TestParamInfo<MultiPaddingTestSuite::ParamType>& info) {
      std::string name = test::typeName(info.param);
      return name;
    });