Commit: Update DecoderSelfAttentionLayer.cc (#269)
* Update DecoderSelfAttentionLayer.cc

Need to allow head_size to be 80 for the Salesforce CodeGen GPT-J-based model.

* Update decoder_masked_multihead_attention.cu

* Update DecoderCrossAttentionLayer.cu

* Update tf_fused_self_multihead_attention_unit_test.py
odellus authored Aug 4, 2022 · 1 parent 43ae78a · commit bb638d8
Showing 4 changed files with 13 additions and 8 deletions.
decoder_masked_multihead_attention.cu

@@ -1297,6 +1297,9 @@ void multihead_attention_(const KERNEL_PARAMS_TYPE& params, const cudaStream_t&
         case 64:
             mmha_launch_kernel<T, 64, 64, KERNEL_PARAMS_TYPE>(params, stream);
             break;
+        case 80:
+            mmha_launch_kernel<T, 80, 128, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
         case 96:
             mmha_launch_kernel<T, 96, 128, KERNEL_PARAMS_TYPE>(params, stream);
             break;
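For context on the dispatch above: the second template argument of mmha_launch_kernel is the padded head dimension the kernel is built for, and the visible cases (64 -> 64, 96 -> 128, and the new 80 -> 128) are consistent with rounding the real head size up to the next power of two. A minimal standalone sketch of that rounding rule, under that assumption (round_up_to_kernel_dim is a made-up name, not a FasterTransformer function):

#include <cstdio>

// Hypothetical helper mirroring the switch above: pick the padded kernel
// dimension as the smallest power of two that is >= the actual head size.
static int round_up_to_kernel_dim(int head_size)
{
    int padded = 32;  // smallest size the dispatch handles
    while (padded < head_size) {
        padded *= 2;
    }
    return padded;
}

int main()
{
    const int sizes[] = {64, 80, 96, 128, 160, 192, 224, 256};
    for (int s : sizes) {
        std::printf("head_size %3d -> padded kernel dim %3d\n", s, round_up_to_kernel_dim(s));
    }
    return 0;
}

Under that reading, head_size 80 naturally maps to the 128-wide kernel instantiation, which is exactly what the added case does.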
DecoderCrossAttentionLayer.cu

@@ -711,8 +711,9 @@ DecoderCrossAttentionLayer<T>::DecoderCrossAttentionLayer(size_t max_batch_size,
     d_model_(d_model),
     q_scaling_(q_scaling)
 {
-    FT_CHECK(size_per_head_ == 32 || size_per_head_ == 64 || size_per_head_ == 96 || size_per_head_ == 128
-             || size_per_head_ == 160 || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256);
+    FT_CHECK(size_per_head_ == 32 || size_per_head_ == 64 || size_per_head_ == 96 || size_per_head_ == 80
+             || size_per_head_ == 128 || size_per_head_ == 160 || size_per_head_ == 192 || size_per_head_ == 224
+             || size_per_head_ == 256);
 }
 
 template<typename T>

@@ -931,4 +932,4 @@ void DecoderCrossAttentionLayer<T>::forward(std::vector<fastertransformer::Tenso
 template class DecoderCrossAttentionLayer<float>;
 template class DecoderCrossAttentionLayer<half>;
 
-} // namespace fastertransformer
+} // namespace fastertransformer
DecoderSelfAttentionLayer.cc

@@ -228,8 +228,9 @@ DecoderSelfAttentionLayer<T>::DecoderSelfAttentionLayer(size_t max_batch_size,
     q_scaling_(q_scaling),
     int8_mode_(int8_mode)
 {
-    FT_CHECK(size_per_head_ == 32 || size_per_head_ == 64 || size_per_head_ == 96 || size_per_head_ == 128
-             || size_per_head_ == 160 || size_per_head_ == 192 || size_per_head_ == 224 || size_per_head_ == 256);
+    FT_CHECK(size_per_head_ == 32 || size_per_head_ == 64 || size_per_head_ == 80 || size_per_head_ == 96
+             || size_per_head_ == 128 || size_per_head_ == 160 || size_per_head_ == 192 || size_per_head_ == 224
+             || size_per_head_ == 256);
 }
 
 template<typename T>
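As a side note on where an 80-dimensional head comes from: a CodeGen-style configuration with, say, hidden size 2560 and 32 attention heads (an assumed example, not taken from this commit) gives 2560 / 32 = 80 per head, which the previous FT_CHECK rejected. A small self-contained sketch of the relaxed check follows; is_supported_head_size is a hypothetical stand-in for the FT_CHECK above:

#include <cassert>

// Illustrative stand-in for the updated FT_CHECK: the per-head dimension must
// be one of the sizes the fused decoder attention kernels are instantiated for.
static bool is_supported_head_size(int size_per_head)
{
    switch (size_per_head) {
        case 32: case 64: case 80: case 96: case 128:
        case 160: case 192: case 224: case 256:
            return true;
        default:
            return false;
    }
}

int main()
{
    const int hidden_size   = 2560;  // assumed CodeGen-style config, for illustration
    const int head_num      = 32;
    const int size_per_head = hidden_size / head_num;  // 80

    assert(is_supported_head_size(size_per_head));  // passes with 80 now on the list
    return 0;
}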
tests/decoding/tf_fused_self_multihead_attention_unit_test.py (6 changes: 3 additions & 3 deletions)

@@ -56,12 +56,12 @@ def test_attn_head_fp16(self):
             self.run_attn(4, 128, head, 64, tf.float16)
 
     def test_attn_size_fp32(self):
-        for size in [32, 64, 96, 128, 160, 192, 224, 256]:
+        for size in [32, 64, 80, 96, 128, 160, 192, 224, 256]:
             tf.reset_default_graph()
             self.run_attn(4, 128, 12, size, tf.float32)
 
     def test_attn_size_fp16(self):
-        for size in [32, 64, 96, 128, 160, 192, 224, 256]:
+        for size in [32, 64, 80, 96, 128, 160, 192, 224, 256]:
             tf.reset_default_graph()
             self.run_attn(4, 128, 12, size, tf.float16)

@@ -171,4 +171,4 @@ def run_attn(self, batch_size, seq_len, head_num, size_per_head, data_type):
     assert(v_cache_max_diff < threshold)
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main()
