-
Notifications
You must be signed in to change notification settings - Fork 528
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Summary: Pull Request resolved: #3174 Previously, VBE on CPU was enabled in lookup_{{ optimizer }}.py. To support MTIA ops, VBE should be done after torch.ops.fbgemm.{{ mdesc }}_embedding_codegen_lookup_{{ optimizer }}_function_pt2. This diff follows the same implementation but enables it in C++ so that it goes through the same PT2 pipeline (i.e., lookup -> VBE autograd -> cpu wrapper (*do vbe here*) -> cpu kernel). the call is done Differential Revision: D63410944
- Loading branch information
1 parent
e90603b
commit 2b74f69
Showing
6 changed files
with
221 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
/* | ||
* Copyright (c) Meta Platforms, Inc. and affiliates. | ||
* All rights reserved. | ||
* | ||
* This source code is licensed under the BSD-style license found in the | ||
* LICENSE file in the root directory of this source tree. | ||
*/ | ||
|
||
#include <algorithm>
#include <cstdint>
#include <vector>

#include <ATen/ATen.h>
#include <ATen/TypeDefault.h>
// #include <ATen/core/op_registration/op_registration.h>
// #include <torch/script.h>
// #include "fbgemm_gpu/embedding_common.h"
// #include "fbgemm_gpu/utils/dispatch_macros.h"
// #include "fbgemm_gpu/utils/ops_utils.h"
// #include "fbgemm_gpu/utils/tensor_utils.h"
|
||
using Tensor = at::Tensor; | ||
|
||
namespace fbgemm_gpu { | ||
|
||
//////////////////////////////////////////////////////////////////////////////// | ||
// Helper Functions | ||
//////////////////////////////////////////////////////////////////////////////// | ||
|
||
/// Scatters a flattened VBE (variable batch size) gradient into a dense
/// `[max_B, total_D]` tensor so that CPU VBE can use the same backend.
///
/// `grad_output` is consumed front to back along dim 0: for every rank `r`
/// (outer loop) and every feature `t` (inner loop) the next `b * D` elements
/// are taken and viewed as a `[b, D]` block, where
///   b = B_offsets_rank_per_feature[t][r + 1] - B_offsets_rank_per_feature[t][r]
///   D = D_offsets[t + 1] - D_offsets[t]
/// That block is written to rows `[b_begin, b_end)` and to the column window
/// belonging to feature `t` in the result.
///
/// @param grad_output Source gradient; sliced along dim 0.
///   NOTE(review): presumably a 1-D flattened tensor -- confirm with caller.
/// @param B_offsets Per-feature batch offsets; entries `0..T` are read and
///   the largest consecutive difference becomes `max_B`.
/// @param B_offsets_rank_per_feature 2-D tensor indexed as `[t][r]`;
///   `size(1) - 1` is used as the number of ranks.
/// @param D_offsets Per-feature embedding-dimension offsets;
///   `T = D_offsets.numel() - 1` features are processed.
/// @return A new tensor of shape `[max_B, total_D]` created with
///   `grad_output.options()`. It is allocated with `at::empty`, so any cell
///   that is not covered by one of the written blocks is left uninitialized.
Tensor reshape_vbe_output(
    const Tensor& grad_output,
    const Tensor& B_offsets,
    const Tensor& B_offsets_rank_per_feature,
    const Tensor& D_offsets) {
  // All sizes and offsets are kept as int64_t: the running element offset into
  // grad_output (and a single b * D product) can exceed the int32 range.
  const int64_t T = D_offsets.numel() - 1;

  // Read every per-feature width once; it is reused for each rank below
  // instead of being re-fetched from the tensor inside the nested loop.
  std::vector<int64_t> feature_dims;
  int64_t max_B = 0;
  int64_t total_D = 0;
  if (T > 0) {
    feature_dims.reserve(static_cast<size_t>(T));
    int64_t B_prev = B_offsets[0].item<int64_t>();
    int64_t D_prev = D_offsets[0].item<int64_t>();
    for (int64_t t = 0; t < T; ++t) {
      const int64_t B_next = B_offsets[t + 1].item<int64_t>();
      const int64_t D_next = D_offsets[t + 1].item<int64_t>();
      const int64_t D = D_next - D_prev;
      max_B = std::max(max_B, B_next - B_prev);
      total_D += D;
      feature_dims.push_back(D);
      B_prev = B_next;
      D_prev = D_next;
    }
  }

  // Dense destination [max_B, total_D]; only the windows written below are
  // defined afterwards.
  auto grad_output_ = at::empty({max_B, total_D}, grad_output.options());

  const int64_t R = B_offsets_rank_per_feature.size(1) - 1;

  // Position of the next unread element in grad_output.
  int64_t offset = 0;
  for (int64_t r = 0; r < R; ++r) {
    // Column where the current feature starts; restarts for every rank.
    int64_t D_offset = 0;
    for (int64_t t = 0; t < T; ++t) {
      const int64_t b_begin =
          B_offsets_rank_per_feature[t][r].item<int64_t>();
      const int64_t b_end =
          B_offsets_rank_per_feature[t][r + 1].item<int64_t>();
      const int64_t D = feature_dims[static_cast<size_t>(t)];
      const int64_t b = b_end - b_begin;
      const int64_t num_elm = b * D;

      const auto values =
          grad_output.slice(0, offset, offset + num_elm).reshape({b, D});
      grad_output_.index_put_(
          {at::indexing::Slice(b_begin, b_end),
           at::indexing::Slice(D_offset, D_offset + D)},
          values);

      D_offset += D;
      offset += num_elm;
    }
  }
  return grad_output_;
}
} // namespace fbgemm_gpu |
Oops, something went wrong.