From d83204b1461e150201dc19c426642c71730616d7 Mon Sep 17 00:00:00 2001
From: bhsueh <bhsueh@nvidia.com>
Date: Tue, 16 Aug 2022 00:22:04 -0700
Subject: [PATCH] fix: fix bug where the tanh PTX instruction requires CUDA 11

---
 docs/bert_guide.md                                  | 3 ---
 docs/decoder_guide.md                               | 3 ---
 src/fastertransformer/kernels/activation_kernels.cu | 7 ++++++-
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/docs/bert_guide.md b/docs/bert_guide.md
index f4ae49258..7632e679e 100644
--- a/docs/bert_guide.md
+++ b/docs/bert_guide.md
@@ -157,10 +157,7 @@ For those unable to use the NGC container, to set up the required environment or
 
     You can choose the tensorflow version and python version you want. Here, we list some possible images:
 
-    - `nvcr.io/nvidia/tensorflow:19.07-py2` contains the TensorFlow 1.14 and python 2.7. 
     - `nvcr.io/nvidia/tensorflow:20.12-tf1-py3` contains the TensorFlow 1.15 and python 3.8. 
-    - `nvcr.io/nvidia/pytorch:20.03-py3` contains the PyTorch 1.5.0 and python 3.6
-    - `nvcr.io/nvidia/pytorch:20.07-py3` contains the PyTorch 1.6.0 and python 3.6
     - `nvcr.io/nvidia/pytorch:20.12-py3` contains the PyTorch 1.8.0 and python 3.8
 
     To achieve best performance, we recommend to use the latest image. For example, running image `nvcr.io/nvidia/tensorflow:22.04-tf1-py3` by 
diff --git a/docs/decoder_guide.md b/docs/decoder_guide.md
index ac3e7c2ce..445446370 100644
--- a/docs/decoder_guide.md
+++ b/docs/decoder_guide.md
@@ -156,10 +156,7 @@ For those unable to use the NGC container, to set up the required environment or
 
     You can choose the tensorflow version and python version you want. Here, we list some possible images:
 
-    - `nvcr.io/nvidia/tensorflow:19.07-py2` contains the TensorFlow 1.14 and python 2.7. 
     - `nvcr.io/nvidia/tensorflow:20.12-tf1-py3` contains the TensorFlow 1.15 and python 3.8. 
-    - `nvcr.io/nvidia/pytorch:20.03-py3` contains the PyTorch 1.5.0 and python 3.6
-    - `nvcr.io/nvidia/pytorch:20.07-py3` contains the PyTorch 1.6.0 and python 3.6
     - `nvcr.io/nvidia/pytorch:20.12-py3` contains the PyTorch 1.8.0 and python 3.8
 
     To achieve best performance, we recommend to use the latest image. For example, running image `nvcr.io/nvidia/tensorflow:20.12-tf1-py3` by 
diff --git a/src/fastertransformer/kernels/activation_kernels.cu b/src/fastertransformer/kernels/activation_kernels.cu
index c16a6077d..42e300b85 100644
--- a/src/fastertransformer/kernels/activation_kernels.cu
+++ b/src/fastertransformer/kernels/activation_kernels.cu
@@ -17,6 +17,11 @@
 #include "src/fastertransformer/kernels/activation_kernels.h"
 #include "src/fastertransformer/kernels/bfloat16_fallback_kenrels.cuh"
 #include "src/fastertransformer/utils/cuda_utils.h"
+
+#ifndef CUDART_VERSION  // an undefined macro evaluates to 0 in #if, which would silently disable the tanh PTX path in tanh_opt
+#error CUDART_VERSION Undefined!
+#endif  // CUDART_VERSION
+
 namespace fastertransformer {
 
 __forceinline__ __device__ float copysignf_pos(float a, float b)
@@ -28,7 +33,7 @@ __forceinline__ __device__ float copysignf_pos(float a, float b)
 
 __inline__ __device__ float tanh_opt(float x)
 {
-#if (__CUDA_ARCH__ >= 750)
+#if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000)
     float r;
     asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x));
     return r;