diff --git a/features/feature_case/ldg/ldg.cu b/features/feature_case/ldg/ldg.cu
new file mode 100644
index 000000000..680f0c5d6
--- /dev/null
+++ b/features/feature_case/ldg/ldg.cu
@@ -0,0 +1,60 @@
+#include "cuda_bf16.h"
+#include "cuda_fp16.h"
+
+#include <cstdio>
+
+// Operands for the __ldg calls below. __ldg is a read-only load from global
+// memory, so every address handed to it must refer to device global memory;
+// statically allocated __device__ variables satisfy that. Their values are
+// irrelevant to the test.
+__device__ float ldgF1;
+__device__ double ldgD;
+__device__ float2 ldgF2;
+__device__ __half ldgH1;
+__device__ __half2 ldgH2;
+__device__ uchar4 ldgU4;
+__device__ ulonglong2 ldgUll2;
+
+// Exercises __ldg on scalar, vector and half-precision operands, both through
+// address-of expressions and through pointer variables, with the result either
+// discarded or bound to an `auto` local.
+//
+// deviceArray is kept so the kernel signature stays unchanged; it is never
+// dereferenced. Every thread performs the same loads and none writes memory,
+// so any launch configuration is valid.
+__global__ void test_ldg_tex_cache_read(int *deviceArray) {
+  float2 *f2 = &ldgF2;
+  __half2 *h2 = &ldgH2;
+  ulonglong2 *ull2 = &ldgUll2;
+
+  __ldg(&ldgF1);
+  auto cacheReadD = __ldg(&ldgD);
+  __ldg(f2);
+  auto cacheReadH1 = __ldg(&ldgH1);
+  __ldg(h2);
+  __ldg(&ldgU4);
+  __ldg(ull2);
+
+  // Only the loads matter; the captured values are intentionally unused.
+  (void)cacheReadD;
+  (void)cacheReadH1;
+}
+
+// Launches the kernel with 4 blocks of 4 threads and returns non-zero when the
+// allocation, the launch, or the kernel execution reports a CUDA error.
+int main() {
+  int *deviceArray = nullptr;
+  cudaError_t status = cudaMalloc(&deviceArray, sizeof(*deviceArray));
+  if (status == cudaSuccess) {
+    test_ldg_tex_cache_read<<<4, 4>>>(deviceArray);
+    status = cudaGetLastError(); // launch-configuration errors
+    if (status == cudaSuccess)
+      status = cudaDeviceSynchronize(); // errors raised while the kernel ran
+    cudaFree(deviceArray);
+  }
+  if (status != cudaSuccess) {
+    std::fprintf(stderr, "ldg test failed: %s\n", cudaGetErrorString(status));
+    return 1;
+  }
+  return 0;
+}
diff --git a/features/feature_case/math/math-bf16-conv.cu b/features/feature_case/math/math-bf16-conv.cu
index e9cdb6bd9..be7716df5 100644
--- a/features/feature_case/math/math-bf16-conv.cu
+++ b/features/feature_case/math/math-bf16-conv.cu
@@ -950,46 +950,6 @@ void testLdcvCases(const vector<pair<__nv_bfloat162, int>> &TestCases) {
   }
 }
 
-__global__ void ldg(float *const Result, __nv_bfloat16 *Input1) {
-  *Result = __ldg(Input1);
-}
-
-void testLdgCases(const vector<pair<__nv_bfloat16, int>> &TestCases) {
-  float *Result;
-  cudaMallocManaged(&Result, sizeof(*Result));
-  for (const auto &TestCase : TestCases) {
-    __nv_bfloat16 *Input;
-    cudaMallocManaged(&Input, sizeof(*Input));
-    setValue<<<1, 1>>>(Input, TestCase.first);
-    cudaDeviceSynchronize();
-    ldg<<<1, 1>>>(Result, Input);
-    cudaDeviceSynchronize();
-    checkResult("__ldg", {TestCase.first}, TestCase.first, *Result,
-                TestCase.second);
-  }
-}
-
-__global__ void ldg(float *const Result, __nv_bfloat162 *Input1) {
-  auto ret = __ldg(Input1);
-  Result[0] = __bfloat162float(ret.x);
-  Result[1] = __bfloat162float(ret.y);
-}
-
-void testLdgCases(const vector<pair<__nv_bfloat162, int>> &TestCases) {
-  float *Result;
-  cudaMallocManaged(&Result, 2 * sizeof(*Result));
-  for (const auto &TestCase : TestCases) {
-    __nv_bfloat162 *Input;
-    cudaMallocManaged(&Input, sizeof(*Input));
-    setValue<<<1, 1>>>(Input, TestCase.first);
-    cudaDeviceSynchronize();
-    ldg<<<1, 1>>>(Result, Input);
-    cudaDeviceSynchronize();
-    checkResult("__ldg", {TestCase.first}, TestCase.first,
-                {Result[0], Result[1]}, TestCase.second);
-  }
-}
-
 __global__ void ldlu(float *const Result, __nv_bfloat16 *Input1) {
   *Result = __ldlu(Input1);
 }
@@ -1567,20 +1527,6 @@ int main() {
       {{1, 100.6}, 14},
       {{100.6, 1}, 14},
   });
-  testLdgCases({
-      {-0.3, 16},
-      {-0.4, 16},
-      {0, 37},
-      {0.7, 16},
-      {1, 15},
-      {100.6, 14},
-  });
-  testLdgCases({
-      {{-0.3, -0.4}, 16},
-      {{0, 0.7}, 16},
-      {{1, 100.6}, 14},
-      {{100.6, 1}, 14},
-  });
   testLdluCases({
       {-0.3, 16},
       {-0.4, 16},