-
Notifications
You must be signed in to change notification settings - Fork 305
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[PyTorch] Add FP8 padding and unpadding module
1. Add multi-tensor padding kernel 2. Add FP8Padding and Fp8Unpadding module 3. Add padding grouped linear UT case Signed-off-by: beinggod <[email protected]>
- Loading branch information
beinggod
committed
Aug 30, 2024
1 parent
8ddac3d
commit c5fd1b4
Showing
16 changed files
with
1,009 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
/************************************************************************* | ||
* Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
* | ||
* See LICENSE for license information. | ||
************************************************************************/ | ||
|
||
#include <cstring> | ||
#include <iomanip> | ||
#include <iostream> | ||
#include <memory> | ||
#include <random> | ||
#include <vector> | ||
#include <cstdio> | ||
|
||
#include <cuda_bf16.h> | ||
#include <cuda_runtime.h> | ||
#include <gtest/gtest.h> | ||
|
||
#include <transformer_engine/padding.h> | ||
#include "../test_common.h" | ||
|
||
using namespace transformer_engine; | ||
|
||
namespace { | ||
|
||
// CPU reference for multi-tensor row padding: for each tensor, copy the
// first `height` rows from input to output (round-tripping the value
// through float, mirroring the kernel's compute type) and zero-fill the
// remaining rows up to `padded_height`. Row width is unchanged.
template <typename InputType, typename OutputType>
void compute_ref(const std::vector<std::vector<InputType>>& input_list,
                 std::vector<std::vector<OutputType>>& output_list,
                 const std::vector<size_t>& height_list,
                 const std::vector<size_t>& width_list,
                 const std::vector<int>& padded_height_list) {
  using compute_t = float;
  for (size_t t = 0; t < input_list.size(); ++t) {
    const auto& src = input_list[t];
    auto& dst = output_list[t];
    const size_t rows = height_list[t];
    const size_t cols = width_list[t];
    const size_t padded_rows = padded_height_list[t];

    for (size_t r = 0; r < padded_rows; ++r) {
      const bool pad_row = (r >= rows);  // rows past the input are zero-filled
      for (size_t c = 0; c < cols; ++c) {
        if (pad_row) {
          dst[r * cols + c] = static_cast<OutputType>(0.f);
        } else {
          const compute_t val = static_cast<compute_t>(src[r * cols + c]);
          dst[r * cols + c] = static_cast<OutputType>(val);
        }
      }
    }
  }
}
|
||
template <typename InputType, typename OutputType> | ||
void performTest() { | ||
using namespace test; | ||
|
||
const DType itype = TypeInfo<InputType>::dtype; | ||
const DType otype = TypeInfo<OutputType>::dtype; | ||
const std::vector<std::pair<size_t, size_t>> tensor_dims = {{1,1}, | ||
{1,768}, | ||
{768,1}, | ||
{768,768}, | ||
{43,43}, | ||
{43,256}, | ||
{256,43}, | ||
{256,256}}; | ||
const size_t num_tensors = tensor_dims.size(); | ||
constexpr int align = 16; | ||
|
||
// Buffers for Transformer Engine implementation | ||
std::vector<Tensor> input_list, output_list, output_t_list; | ||
|
||
// Buffers for reference implementation | ||
std::vector<std::vector<InputType>> ref_input_list; | ||
std::vector<std::vector<OutputType>> ref_output_list; | ||
std::vector<size_t> ref_height_list(num_tensors), ref_width_list(num_tensors); | ||
std::vector<int> ref_padded_height_list(num_tensors); | ||
|
||
// Initialize buffers | ||
for (size_t tensor_id = 0; tensor_id < num_tensors; ++tensor_id) { | ||
const size_t height = tensor_dims[tensor_id].first; | ||
const size_t width = tensor_dims[tensor_id].second; | ||
const size_t padded_height = (height + align - 1) / align * align; | ||
input_list.emplace_back(Tensor({ height, width }, itype)); | ||
output_list.emplace_back(Tensor({ padded_height, width }, otype)); | ||
|
||
auto& input = input_list.back(); | ||
auto& output = output_list.back(); | ||
fillUniform(&input); | ||
setRandomScale(&output); | ||
|
||
ref_input_list.emplace_back(height*width); | ||
ref_output_list.emplace_back(padded_height*width); | ||
|
||
std::copy(input.cpu_dptr<InputType>(), | ||
input.cpu_dptr<InputType>() + height * width, | ||
ref_input_list.back().begin()); | ||
ref_height_list[tensor_id] = height; | ||
ref_width_list[tensor_id] = width; | ||
ref_padded_height_list[tensor_id] = padded_height; | ||
} | ||
|
||
// Transformer Engine implementation | ||
auto make_nvte_vector = [](std::vector<Tensor>& tensor_list) | ||
-> std::vector<NVTETensor> { | ||
std::vector<NVTETensor> nvte_tensor_list; | ||
for (auto& tensor : tensor_list) { | ||
nvte_tensor_list.emplace_back(tensor.data()); | ||
} | ||
return nvte_tensor_list; | ||
}; | ||
nvte_multi_padding(num_tensors, | ||
make_nvte_vector(input_list).data(), | ||
make_nvte_vector(output_list).data(), | ||
ref_padded_height_list.data(), | ||
0); | ||
cudaDeviceSynchronize(); | ||
auto err = cudaGetLastError(); | ||
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err); | ||
|
||
// Reference implementation | ||
compute_ref<InputType, OutputType>(ref_input_list, | ||
ref_output_list, | ||
ref_height_list, | ||
ref_width_list, | ||
ref_padded_height_list); | ||
|
||
// Check correctness | ||
for (size_t tensor_id = 0; tensor_id < num_tensors; ++tensor_id) { | ||
auto [atol, rtol] = getTolerances(otype); | ||
compareResults("output", | ||
output_list[tensor_id], | ||
ref_output_list[tensor_id].data(), | ||
atol, rtol); | ||
} | ||
} | ||
|
||
} // namespace | ||
|
||
// Test fixture parameterized over the tensor element dtype; the TEST_P
// below uses the same dtype for both input and output.
class MultiPaddingTestSuite
    : public ::testing::TestWithParam<
        transformer_engine::DType> {};
|
||
// Run performTest for the parameter dtype. The TYPE_SWITCH_ALL macros map
// the runtime DType value onto the concrete C++ template arguments.
// NOTE(review): the test name mentions "Transpose" but only padding is
// exercised here — consider renaming in a follow-up.
TEST_P(MultiPaddingTestSuite, TestMultiPaddingTranspose) {
  using namespace transformer_engine;
  using namespace test;

  // Input and output share the same dtype (output_type = input_type below).
  const DType input_type = GetParam();
  const DType output_type = input_type;

  TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(input_type, InputType,
    TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(output_type, OutputType,
      performTest<InputType, OutputType>();
    );
  );
}
|
||
|
||
// Instantiate the suite once per floating-point dtype; each generated test
// is suffixed with the dtype's name via the custom name generator.
INSTANTIATE_TEST_SUITE_P(
  OperatorTest,
  MultiPaddingTestSuite,
  ::testing::ValuesIn(test::all_fp_types),
  [](const testing::TestParamInfo<MultiPaddingTestSuite::ParamType>& info) {
    std::string name = test::typeName(info.param);
    return name;
  });
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.