diff --git a/ext/ext/ck3.py b/ext/ext/ck3.py
index aaa47dba1..f3ee1e7b9 100644
--- a/ext/ext/ck3.py
+++ b/ext/ext/ck3.py
@@ -1,5 +1,12 @@
 #!/usr/bin/env python3
 
+'''
+python3 this_script.py -c y3/config.yaml
+
+or
+
+python3 this_script.py y3/config.yaml
+'''
 
 import os
 import re
@@ -14,7 +21,7 @@
 ########################################################################
 
 
-alphabets = {
+rc_alphabets = {
     '0' : 'a', '1' : 'b', '2' : 'c', '3' : 'd', '4' : 'e',
     '5' : 'f', '6' : 'g', '7' : 'h', '8' : 'i', '9' : 'j',
     '10': 'k', '11': 'l', '12': 'm', '13': 'n', '14': 'o',
@@ -22,7 +29,6 @@
     '20': 'u', '21': 'v', '22': 'w', '23': 'x', '24': 'y',
     '25': 'z'}
 
-
 rc_kernel2c = '''
 TEMPLATE_PARAMS            \
 __global__                 \
@@ -40,31 +46,24 @@
     USING_DEVICE_VARIABLES    KERNEL_CONSTEXPR_FLAGS \
     const int ithread = threadIdx.x + blockIdx.x * blockDim.x;
 
-
     DECLARE_ZERO_LOCAL_COUNT    DECLARE_ZERO_LOCAL_ENERGY    DECLARE_ZERO_LOCAL_VIRIAL
     DECLARE_FORCE_I_AND_K       DECLARE_PARAMS_I_AND_K
 
-
     for (int ii = ithread; ii < nexclude; ii += blockDim.x * gridDim.x) {
         KERNEL_SCALED_KLANE    KERNEL_ZERO_LOCAL_FORCE
 
-
         int i = exclude[ii][0];
         int k = exclude[ii][1];
         KERNEL_LOAD_1X_SCALES
 
-
         KERNEL_INIT_EXCLUDE_PARAMS_I_AND_K
 
-
         constexpr bool incl = true;
         KERNEL_SCALED_PAIRWISE_INTERACTION
 
-
         KERNEL_SAVE_LOCAL_FORCE
     }
 
-
     KERNEL_SUM_COUNT    KERNEL_SUM_ENERGY    KERNEL_SUM_VIRIAL
 }
 '''
@@ -91,20 +90,16 @@
     const int nwarp = blockDim.x * gridDim.x / WARP_SIZE;
     const int ilane = threadIdx.x & (WARP_SIZE - 1);
 
-
     DECLARE_ZERO_LOCAL_COUNT    DECLARE_ZERO_LOCAL_ENERGY   DECLARE_ZERO_LOCAL_VIRIAL
     DECLARE_PARAMS_I_AND_K      DECLARE_FORCE_I_AND_K
 
-
     for (int iw = iwarp; iw < nakpl; iw += nwarp) {
         KERNEL_ZERO_LOCAL_FORCE
 
-
         int tri, tx, ty;
         tri = iakpl[iw];
         tri_to_xy(tri, tx, ty);
 
-
         int iid = ty * WARP_SIZE + ilane;
         int atomi = min(iid, n - 1);
         int i = sorted[atomi].unsorted;
@@ -114,7 +109,6 @@
         KERNEL_INIT_PARAMS_I_AND_K
         KERNEL_SYNCWARP
 
-
         KERNEL_LOAD_INFO_VARIABLES
         for (int j = 0; j < WARP_SIZE; ++j) {
             int srclane = (ilane + j) & (WARP_SIZE - 1); \
@@ -124,16 +118,13 @@
             KERNEL_SCALE_1                               \
             KERNEL_FULL_PAIRWISE_INTERACTION
 
-
             iid = __shfl_sync(ALL_LANES, iid, ilane + 1);
             KERNEL_SHUFFLE_PARAMS_I    KERNEL_SHUFFLE_LOCAL_FORCE_I
         }
 
-
         KERNEL_SAVE_LOCAL_FORCE   KERNEL_SYNCWARP
     }
 
-
     KERNEL_SUM_COUNT    KERNEL_SUM_ENERGY    KERNEL_SUM_VIRIAL
 }
 '''
@@ -159,15 +150,12 @@
     const int nwarp = blockDim.x * gridDim.x / WARP_SIZE;
     const int ilane = threadIdx.x & (WARP_SIZE - 1);
 
-
     DECLARE_ZERO_LOCAL_COUNT    DECLARE_ZERO_LOCAL_ENERGY   DECLARE_ZERO_LOCAL_VIRIAL
     DECLARE_PARAMS_I_AND_K      DECLARE_FORCE_I_AND_K
 
-
     for (int iw = iwarp; iw < niak; iw += nwarp) {
         KERNEL_ZERO_LOCAL_FORCE
 
-
         int ty = iak[iw];
         int atomi = ty * WARP_SIZE + ilane;
         int i = sorted[atomi].unsorted;
@@ -176,22 +164,18 @@
         KERNEL_INIT_PARAMS_I_AND_K
         KERNEL_SYNCWARP
 
-
         for (int j = 0; j < WARP_SIZE; ++j) {
             KERNEL_KLANE2          \
             bool incl = atomk > 0; \
             KERNEL_SCALE_1         \
             KERNEL_FULL_PAIRWISE_INTERACTION
 
-
             KERNEL_SHUFFLE_PARAMS_I    KERNEL_SHUFFLE_LOCAL_FORCE_I
         }
 
-
         KERNEL_SAVE_LOCAL_FORCE    KERNEL_SYNCWARP
     }
 
-
     KERNEL_SUM_COUNT    KERNEL_SUM_ENERGY    KERNEL_SUM_VIRIAL
 }
 '''
@@ -220,42 +204,33 @@
     const int nwarp = blockDim.x * gridDim.x / WARP_SIZE;
     const int ilane = threadIdx.x & (WARP_SIZE - 1);
 
-
     DECLARE_ZERO_LOCAL_COUNT    DECLARE_ZERO_LOCAL_ENERGY   DECLARE_ZERO_LOCAL_VIRIAL
     DECLARE_PARAMS_I_AND_K      DECLARE_FORCE_I_AND_K
 
-
     KERNEL_HAS_1X_SCALE
     for (int ii = ithread; ii < nexclude; ii += blockDim.x * gridDim.x) {
         KERNEL_SCALED_KLANE    KERNEL_ZERO_LOCAL_FORCE
 
-
         int i = exclude[ii][0];
         int k = exclude[ii][1];
         KERNEL_LOAD_1X_SCALES
 
-
         KERNEL_INIT_EXCLUDE_PARAMS_I_AND_K
 
-
         constexpr bool incl = true;
         KERNEL_SCALED_PAIRWISE_INTERACTION
 
-
         KERNEL_SAVE_LOCAL_FORCE
     }
     // */
 
-
     for (int iw = iwarp; iw < nakpl; iw += nwarp) {
         KERNEL_ZERO_LOCAL_FORCE
 
-
         int tri, tx, ty;
         tri = iakpl[iw];
         tri_to_xy(tri, tx, ty);
 
-
         int iid = ty * WARP_SIZE + ilane;
         int atomi = min(iid, n - 1);
         int i = sorted[atomi].unsorted;
@@ -265,7 +240,6 @@
         KERNEL_INIT_PARAMS_I_AND_K
         KERNEL_SYNCWARP
 
-
         KERNEL_LOAD_INFO_VARIABLES
         for (int j = 0; j < WARP_SIZE; ++j) {
             int srclane = (ilane + j) & (WARP_SIZE - 1); \
@@ -275,20 +249,16 @@
             KERNEL_SCALE_1                               \
             KERNEL_FULL_PAIRWISE_INTERACTION
 
-
             iid = __shfl_sync(ALL_LANES, iid, ilane + 1);
             KERNEL_SHUFFLE_PARAMS_I    KERNEL_SHUFFLE_LOCAL_FORCE_I
         }
 
-
         KERNEL_SAVE_LOCAL_FORCE   KERNEL_SYNCWARP
     }
 
-
     for (int iw = iwarp; iw < niak; iw += nwarp) {
         KERNEL_ZERO_LOCAL_FORCE
 
-
         int ty = iak[iw];
         int atomi = ty * WARP_SIZE + ilane;
         int i = sorted[atomi].unsorted;
@@ -297,22 +267,18 @@
         KERNEL_INIT_PARAMS_I_AND_K
         KERNEL_SYNCWARP
 
-
         for (int j = 0; j < WARP_SIZE; ++j) {
             KERNEL_KLANE2          \
             bool incl = atomk > 0; \
             KERNEL_SCALE_1         \
             KERNEL_FULL_PAIRWISE_INTERACTION
 
-
             KERNEL_SHUFFLE_PARAMS_I    KERNEL_SHUFFLE_LOCAL_FORCE_I
         }
 
-
         KERNEL_SAVE_LOCAL_FORCE    KERNEL_SYNCWARP
     }
 
-
     KERNEL_SUM_COUNT
     KERNEL_SUM_ENERGY
     KERNEL_SUM_VIRIAL
@@ -570,7 +536,7 @@ def _load_scale_param(ptype:str, stem:str, input:str, separate_scaled_pairwise:b
                 v = ''
                 for i in range(1,len(ss)):
                     idx = ss[i]
-                    al = alphabets[idx]
+                    al = rc_alphabets[idx]
                     if input is None:
                         if not separate_scaled_pairwise:
                             v = v + '{} {}{} = 1;'.format(t, stem, al)
@@ -610,6 +576,8 @@ def _kv(self, k:str):
             return self.config[k]
         else:
             return ''
+
+
     def cudaReplaceDict(self) -> dict:
         d = {}
         config = self.config
diff --git a/src/cu/amoeba/dfield_cu1.cc b/src/cu/amoeba/dfield_cu1.cc
index 2a2fd3fad..e6ffdca35 100644
--- a/src/cu/amoeba/dfield_cu1.cc
+++ b/src/cu/amoeba/dfield_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.0
+// ck.py Version 3.0.2
 template <class ETYP>
 __global__
 void dfield_cu1(int n, TINKER_IMAGE_PARAMS, real off, const unsigned* restrict dpinfo, int nexclude,
diff --git a/src/cu/amoeba/emplar.cu b/src/cu/amoeba/emplar.cu
index 9d07c2ec5..7c919d6f9 100644
--- a/src/cu/amoeba/emplar.cu
+++ b/src/cu/amoeba/emplar.cu
@@ -1,10 +1,10 @@
 #include "ff/amoeba/empole.h"
 #include "ff/amoeba/epolar.h"
 #include "ff/amoeba/induce.h"
-#include "ff/modamoeba.h"
 #include "ff/cumodamoeba.h"
 #include "ff/elec.h"
 #include "ff/image.h"
+#include "ff/modamoeba.h"
 #include "ff/pme.h"
 #include "ff/spatial.h"
 #include "ff/switch.h"
@@ -76,13 +76,11 @@ void rotG2QIMat_v2(const real (&restrict r)[3][3],                   //
 
 template <class Ver, class ETYP>
 __device__
-void pairMplar(                                                           //
-   real r2, real3 dR, real mscale, real dscale, real pscale, real uscale, //
-   real ci, real3 Id, real Iqxx, real Iqxy, real Iqxz, real Iqyy, real Iqyz, real Iqzz, real3 Iud, real3 Iup, real pdi,
-   real pti, //
-   real ck, real3 Kd, real Kqxx, real Kqxy, real Kqxz, real Kqyy, real Kqyz, real Kqzz, real3 Kud, real3 Kup, real pdk,
-   real ptk,            //
-   real f, real aewald, //
+void pairMplar(real r2, real3 dR, real mscale, real dscale, real pscale, real uscale,   //
+   real ci, real3 Id, real Iqxx, real Iqxy, real Iqxz, real Iqyy, real Iqyz, real Iqzz, //
+   real3 Iud, real3 Iup, real pdi, real pti,                                            //
+   real ck, real3 Kd, real Kqxx, real Kqxy, real Kqxz, real Kqyy, real Kqyz, real Kqzz, //
+   real3 Kud, real3 Kup, real pdk, real ptk, real f, real aewald,                       //
    real& restrict frcxi, real& restrict frcyi, real& restrict frczi, real& restrict frcxk, real& restrict frcyk,
    real& restrict frczk, real& restrict trqxi, real& restrict trqyi, real& restrict trqzi, real& restrict trqxk,
    real& restrict trqyk, real& restrict trqzk, real& restrict eo, real& restrict voxx, real& restrict voxy,
@@ -92,74 +90,87 @@ void pairMplar(                                                           //
    constexpr bool do_g = Ver::g;
    constexpr bool do_v = Ver::v;
 
+   if CONSTEXPR (eq<ETYP, EWALD>()) {
+      mscale = 1;
+      dscale = 0.5f;
+      pscale = 0.5f;
+      uscale = 0.5f;
+   } else {
+      dscale *= 0.5f;
+      pscale *= 0.5f;
+      uscale *= 0.5f;
+   }
+
    // a rotation matrix that rotates (xr,yr,zr) to (0,0,r); R G = Q
    real rot[3][3];
    real bn[6];
    real sr3, sr5, sr7, sr9;
    real r = REAL_SQRT(r2);
    real invr1 = REAL_RECIP(r);
-   {
-      real rr2 = invr1 * invr1;
-      real rr1 = invr1;
-      real rr3 = rr1 * rr2;
-      real rr5 = 3 * rr3 * rr2;
-      real rr7 = 5 * rr5 * rr2;
-      real rr9 = 7 * rr7 * rr2;
-      real rr11;
-      if CONSTEXPR (do_g) {
-         rr11 = 9 * rr9 * rr2;
-      }
+   real rr2 = invr1 * invr1;
+   real rr1 = invr1;
+   real rr3 = rr1 * rr2;
+   real rr5 = 3 * rr3 * rr2;
+   real rr7 = 5 * rr5 * rr2;
+   real rr9 = 7 * rr7 * rr2;
+   real rr11;
+   if CONSTEXPR (do_g) {
+      rr11 = 9 * rr9 * rr2;
+   }
 
-      if CONSTEXPR (eq<ETYP, EWALD>()) {
-         if CONSTEXPR (!do_g) {
-            damp_ewald<5>(bn, r, invr1, rr2, aewald);
-         } else {
-            damp_ewald<6>(bn, r, invr1, rr2, aewald);
-         }
-      } else if CONSTEXPR (eq<ETYP, NON_EWALD>()) {
-         bn[0] = rr1;
-         bn[1] = rr3;
-         bn[2] = rr5;
-         bn[3] = rr7;
-         bn[4] = rr9;
-         if CONSTEXPR (do_g) {
-            bn[5] = rr11;
-         }
+   if CONSTEXPR (eq<ETYP, EWALD>()) {
+      if CONSTEXPR (!do_g) {
+         damp_ewald<5>(bn, r, invr1, rr2, aewald);
+      } else {
+         damp_ewald<6>(bn, r, invr1, rr2, aewald);
       }
+   } else if CONSTEXPR (eq<ETYP, NON_EWALD>()) {
+      bn[0] = rr1;
+      bn[1] = rr3;
+      bn[2] = rr5;
+      bn[3] = rr7;
+      bn[4] = rr9;
+      if CONSTEXPR (do_g) {
+         bn[5] = rr11;
+      }
+   }
 
-      // if use_thole
-      real ex3, ex5, ex7, ex9;
-      damp_thole4(r, pdi, pti, pdk, ptk, ex3, ex5, ex7, ex9);
-      sr3 = bn[1] - ex3 * rr3;
-      sr5 = bn[2] - ex5 * rr5;
-      sr7 = bn[3] - ex7 * rr7;
-      sr9 = bn[4] - ex9 * rr9;
-      // end if use_thole
-
-      real3 rotz = invr1 * dR;
-      // pick a random vector as rotx; rotx and rotz cannot be parallel
-      real3 rotx = rotz;
-      if (dR.y != 0 || dR.z != 0)
-         rotx.x += 1;
-      else
-         rotx.y += 1;
-      // Gram–Schmidt process for rotx with respect to rotz
-      rotx -= dot3(rotx, rotz) * rotz;
-      // normalize rotx
-      real invxlen = REAL_RSQRT(dot3(rotx, rotx));
-      rotx = invxlen * rotx;
-      real3 roty = cross(rotz, rotx);
-      rot[0][0] = rotx.x;
-      rot[0][1] = rotx.y;
-      rot[0][2] = rotx.z;
-      rot[1][0] = roty.x;
-      rot[1][1] = roty.y;
-      rot[1][2] = roty.z;
-      rot[2][0] = rotz.x;
-      rot[2][1] = rotz.y;
-      rot[2][2] = rotz.z;
+   // if use_thole
+   real ex3, ex5, ex7, ex9;
+   damp_thole4(r, pdi, pti, pdk, ptk, ex3, ex5, ex7, ex9);
+   sr3 = bn[1] - ex3 * rr3;
+   sr5 = bn[2] - ex5 * rr5;
+   sr7 = bn[3] - ex7 * rr7;
+   sr9 = bn[4] - ex9 * rr9;
+   // end if use_thole
+
+   for (int i = 0; i < 6; ++i) {
+      bn[i] *= mscale;
    }
 
+   real3 rotz = invr1 * dR;
+   // pick a random vector as rotx; rotx and rotz cannot be parallel
+   real3 rotx = rotz;
+   if (dR.y != 0 || dR.z != 0)
+      rotx.x += 1;
+   else
+      rotx.y += 1;
+   // Gram–Schmidt process for rotx with respect to rotz
+   rotx -= dot3(rotx, rotz) * rotz;
+   // normalize rotx
+   real invxlen = REAL_RSQRT(dot3(rotx, rotx));
+   rotx = invxlen * rotx;
+   real3 roty = cross(rotz, rotx);
+   rot[0][0] = rotx.x;
+   rot[0][1] = rotx.y;
+   rot[0][2] = rotx.z;
+   rot[1][0] = roty.x;
+   rot[1][1] = roty.y;
+   rot[1][2] = roty.z;
+   rot[2][0] = rotz.x;
+   rot[2][1] = rotz.y;
+   rot[2][2] = rotz.z;
+
    real3 di, dk;
    rotG2QIVector(rot, Id, di);
    rotG2QIVector(rot, Kd, dk);
@@ -180,17 +191,6 @@ void pairMplar(                                                           //
    real phi2[10] = {0};
    real phi1z[10] = {0};
 
-   if CONSTEXPR (eq<ETYP, EWALD>()) {
-      mscale = 1;
-      dscale = 0.5f;
-      pscale = 0.5f;
-      uscale = 0.5f;
-   } else {
-      dscale *= 0.5f;
-      pscale *= 0.5f;
-      uscale *= 0.5f;
-   }
-
    // C-C
    {
       real coef1 = bn[0];
@@ -238,7 +238,6 @@ void pairMplar(                                                           //
 
    // Q-C and C-Q
    {
-      real coef3 = bn[1];
       real coef5 = bn[2] * r2;
       real coez5 = bn[2] * r;
       real coez7 = bn[3] * r2 * r;
@@ -247,17 +246,14 @@ void pairMplar(                                                           //
       phi2[0] += coef5 * qizz;
       phi1z[0] += -(2 * coez5 - coez7) * qkzz;
       // d2phi_c q
-      phi1[4] += -coef3 * ck;
-      phi1[5] += -coef3 * ck;
-      phi1[6] += -(coef3 - coef5) * ck;
+      // phi1[4]; phi1[5];
+      phi1[6] += coef5 * ck;
       // phi1[7]; phi1[8]; phi1[9];
-      phi2[4] += -coef3 * ci;
-      phi2[5] += -coef3 * ci;
-      phi2[6] += -(coef3 - coef5) * ci;
+      // phi2[4]; phi2[5];
+      phi2[6] += coef5 * ci;
       // phi2[7]; phi2[8]; phi2[9];
-      phi1z[4] += -coez5 * ck;
-      phi1z[5] += -coez5 * ck;
-      phi1z[6] += -(3 * coez5 - coez7) * ck;
+      // phi1z[4]; phi1z[5];
+      phi1z[6] += -(2 * coez5 - coez7) * ck;
       // phi1z[7]; phi1z[8]; phi1z[9];
    }
 
@@ -278,23 +274,20 @@ void pairMplar(                                                           //
       phi1z[2] += 2 * coez7 * qkyz;
       phi1z[3] += (2 * coez7 - coez9) * qkzz;
       // d2phi_d q
-      phi1[4] += coef5 * dk.z;
-      phi1[5] += coef5 * dk.z;
-      phi1[6] += (3 * coef5 - coef7) * dk.z;
+      // phi1[4]; phi1[5];
+      phi1[6] += (2 * coef5 - coef7) * dk.z;
       // phi1[7];
       phi1[8] += 2 * coef5 * dk.x;
       phi1[9] += 2 * coef5 * dk.y;
       //
-      phi2[4] += -coef5 * di.z;
-      phi2[5] += -coef5 * di.z;
-      phi2[6] += -(3 * coef5 - coef7) * di.z;
+      // phi2[4]; phi2[5];
+      phi2[6] += -(2 * coef5 - coef7) * di.z;
       // phi2[7];
       phi2[8] += -2 * coef5 * di.x;
       phi2[9] += -2 * coef5 * di.y;
       //
-      phi1z[4] += -coez7 * dk.z;
-      phi1z[5] += -coez7 * dk.z;
-      phi1z[6] += -(3 * coez7 - coez9) * dk.z;
+      // phi1z[4]; phi1z[5];
+      phi1z[6] += -(2 * coez7 - coez9) * dk.z;
       // phi1z[7];
       phi1z[8] += -2 * coez7 * dk.x;
       phi1z[9] += -2 * coez7 * dk.y;
@@ -310,35 +303,28 @@ void pairMplar(                                                           //
       real coez9 = bn[4] * r2 * r;
       real coez11 = bn[5] * r2 * r2 * r;
       //
-      phi1[4] += 2 * coef5 * qkxx - coef7 * qkzz;
-      phi1[5] += 2 * coef5 * qkyy - coef7 * qkzz;
-      phi1[6] += (2 * coef5 - 5 * coef7 + coef9) * qkzz;
+      phi1[4] += 2 * coef5 * qkxx;
+      phi1[5] += 2 * coef5 * qkyy;
+      phi1[6] += (2 * coef5 - 4 * coef7 + coef9) * qkzz;
       phi1[7] += 4 * coef5 * qkxy;
       phi1[8] += 4 * (coef5 - coef7) * qkxz;
       phi1[9] += 4 * (coef5 - coef7) * qkyz;
       //
-      phi2[4] += 2 * coef5 * qixx - coef7 * qizz;
-      phi2[5] += 2 * coef5 * qiyy - coef7 * qizz;
-      phi2[6] += (2 * coef5 - 5 * coef7 + coef9) * qizz;
+      phi2[4] += 2 * coef5 * qixx;
+      phi2[5] += 2 * coef5 * qiyy;
+      phi2[6] += (2 * coef5 - 4 * coef7 + coef9) * qizz;
       phi2[7] += 4 * coef5 * qixy;
       phi2[8] += 4 * (coef5 - coef7) * qixz;
       phi2[9] += 4 * (coef5 - coef7) * qiyz;
       //
-      phi1z[4] += 2 * coez7 * qkxx + (2 * coez7 - coez9) * qkzz;
-      phi1z[5] += 2 * coez7 * qkyy + (2 * coez7 - coez9) * qkzz;
-      phi1z[6] += (12 * coez7 - 9 * coez9 + coez11) * qkzz;
+      phi1z[4] += 2 * coez7 * qkxx;
+      phi1z[5] += 2 * coez7 * qkyy;
+      phi1z[6] += (10 * coez7 - 8 * coez9 + coez11) * qkzz;
       phi1z[7] += 4 * coez7 * qkxy;
       phi1z[8] += 4 * (3 * coez7 - coez9) * qkxz;
       phi1z[9] += 4 * (3 * coez7 - coez9) * qkyz;
    }
 
-   #pragma unroll
-   for (int i = 0; i < 10; ++i) {
-      phi1[i] *= mscale;
-      phi2[i] *= mscale;
-      phi1z[i] *= mscale;
-   }
-
    if CONSTEXPR (do_e) {
       real e = phi1[0] * ci + phi1[1] * di.x + phi1[2] * di.y + phi1[3] * di.z + phi1[4] * qixx + phi1[5] * qiyy
          + phi1[6] * qizz + phi1[7] * qixy + phi1[8] * qixz + phi1[9] * qiyz;
@@ -418,23 +404,20 @@ void pairMplar(                                                           //
       real coepz7 = pscale * coez7;
       real coepz9 = pscale * coez9;
       // d2phi_u q
-      phi1[4] += coed5 * ukp.z + coep5 * ukd.z;
-      phi1[5] += coed5 * ukp.z + coep5 * ukd.z;
-      phi1[6] += (3 * coed5 - coed7) * ukp.z + (3 * coep5 - coep7) * ukd.z;
+      // phi1[4]; phi1[5];
+      phi1[6] += (2 * coed5 - coed7) * ukp.z + (2 * coep5 - coep7) * ukd.z;
       // phi1[7];
       phi1[8] += 2 * (coed5 * ukp.x + coep5 * ukd.x);
       phi1[9] += 2 * (coed5 * ukp.y + coep5 * ukd.y);
       //
-      phi2[4] += -(coed5 * uip.z + coep5 * uid.z);
-      phi2[5] += -(coed5 * uip.z + coep5 * uid.z);
-      phi2[6] += -(3 * coed5 - coed7) * uip.z - (3 * coep5 - coep7) * uid.z;
+      // phi2[4]; phi2[5];
+      phi2[6] += -(2 * coed5 - coed7) * uip.z - (2 * coep5 - coep7) * uid.z;
       // phi2[7];
       phi2[8] += -2 * (coed5 * uip.x + coep5 * uid.x);
       phi2[9] += -2 * (coed5 * uip.y + coep5 * uid.y);
       //
-      phi1z[4] += -(coedz7 * ukp.z + coepz7 * ukd.z);
-      phi1z[5] += -(coedz7 * ukp.z + coepz7 * ukd.z);
-      phi1z[6] += -(3 * coedz7 - coedz9) * ukp.z - (3 * coepz7 - coepz9) * ukd.z;
+      // phi1z[4]; phi1z[5];
+      phi1z[6] += -(2 * coedz7 - coedz9) * ukp.z - (2 * coepz7 - coepz9) * ukd.z;
       // phi1z[7];
       phi1z[8] += -2 * (coedz7 * ukp.x + coepz7 * ukd.x);
       phi1z[9] += -2 * (coedz7 * ukp.y + coepz7 * ukd.y);
diff --git a/src/cu/amoeba/emplar_cu1.cc b/src/cu/amoeba/emplar_cu1.cc
index 800f5a717..d0637ff71 100644
--- a/src/cu/amoeba/emplar_cu1.cc
+++ b/src/cu/amoeba/emplar_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.0
+// ck.py Version 3.0.2
 template <class Ver, class ETYP>
 __global__
 void emplar_cu1c(TINKER_IMAGE_PARAMS, EnergyBuffer restrict ebuf, VirialBuffer restrict vbuf, grad_prec* restrict gx,
diff --git a/src/cu/amoeba/empole_cu1.cc b/src/cu/amoeba/empole_cu1.cc
index 7ae0a0e23..7876775ca 100644
--- a/src/cu/amoeba/empole_cu1.cc
+++ b/src/cu/amoeba/empole_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.0
+// ck.py Version 3.0.2
 template <class Ver, class ETYP>
 __global__
 void empole_cu1(int n, TINKER_IMAGE_PARAMS, CountBuffer restrict nem, EnergyBuffer restrict em,
diff --git a/src/cu/amoeba/epolar_cu1.cc b/src/cu/amoeba/epolar_cu1.cc
index bcd982cf0..d7a3b8b7a 100644
--- a/src/cu/amoeba/epolar_cu1.cc
+++ b/src/cu/amoeba/epolar_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.0
+// ck.py Version 3.0.2
 template <class Ver, class ETYP>
 __global__
 void epolar_cu1(int n, TINKER_IMAGE_PARAMS, CountBuffer restrict nep, EnergyBuffer restrict ep,
diff --git a/src/cu/amoeba/sparsePrecond_cu1.cc b/src/cu/amoeba/sparsePrecond_cu1.cc
index 92376155b..de44dc1b5 100644
--- a/src/cu/amoeba/sparsePrecond_cu1.cc
+++ b/src/cu/amoeba/sparsePrecond_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.0
+// ck.py Version 3.0.2
 __global__
 void sparsePrecond_cu1(int n, TINKER_IMAGE_PARAMS, real off, const unsigned* restrict uinfo, int nexclude,
    const int (*restrict exclude)[2], const real* restrict exclude_scale, const real* restrict x, const real* restrict y,
diff --git a/src/cu/amoeba/ufield_cu1.cc b/src/cu/amoeba/ufield_cu1.cc
index 96d4c6ff1..462f84511 100644
--- a/src/cu/amoeba/ufield_cu1.cc
+++ b/src/cu/amoeba/ufield_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.0
+// ck.py Version 3.0.2
 template <class ETYP>
 __global__
 void ufield_cu1(int n, TINKER_IMAGE_PARAMS, real off, const unsigned* restrict uinfo, int nexclude,
diff --git a/src/cu/echarge_cu1.cc b/src/cu/echarge_cu1.cc
index c3fa8ef6a..db42dba4c 100644
--- a/src/cu/echarge_cu1.cc
+++ b/src/cu/echarge_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.1
+// ck.py Version 3.0.2
 template <class Ver, class ETYP>
 __global__
 void echarge_cu1(int n, TINKER_IMAGE_PARAMS, CountBuffer restrict nec, EnergyBuffer restrict ec,
diff --git a/src/cu/ehal_cu1.cc b/src/cu/ehal_cu1.cc
index 9422c4dd6..f929eab12 100644
--- a/src/cu/ehal_cu1.cc
+++ b/src/cu/ehal_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.1
+// ck.py Version 3.0.2
 template <class Ver>
 __global__
 void ehal_cu1(int n, TINKER_IMAGE_PARAMS, CountBuffer restrict nev, EnergyBuffer restrict ev, VirialBuffer restrict vev,
diff --git a/src/cu/elj_cu1.cc b/src/cu/elj_cu1.cc
index 6062517f7..690f6d6b1 100644
--- a/src/cu/elj_cu1.cc
+++ b/src/cu/elj_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.1
+// ck.py Version 3.0.2
 template <class Ver>
 __global__
 void elj_cu1(int n, TINKER_IMAGE_PARAMS, CountBuffer restrict nev, EnergyBuffer restrict ev, VirialBuffer restrict vev,
diff --git a/src/cu/hippo/dfieldChgpen_cu1.cc b/src/cu/hippo/dfieldChgpen_cu1.cc
index 2b88291ce..0818fcfd6 100644
--- a/src/cu/hippo/dfieldChgpen_cu1.cc
+++ b/src/cu/hippo/dfieldChgpen_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.1
+// ck.py Version 3.0.2
 template <class ETYP>
 __global__
 void dfieldChgpen_cu1(int n, TINKER_IMAGE_PARAMS, real off, const unsigned* restrict dinfo, int nexclude,
diff --git a/src/cu/hippo/sparsePrecond_cu4.cc b/src/cu/hippo/sparsePrecond_cu4.cc
index 47928ced1..c56d8b748 100644
--- a/src/cu/hippo/sparsePrecond_cu4.cc
+++ b/src/cu/hippo/sparsePrecond_cu4.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.1
+// ck.py Version 3.0.2
 __global__
 void sparsePrecond_cu4(int n, TINKER_IMAGE_PARAMS, real off, const unsigned* restrict winfo, int nexclude,
    const int (*restrict exclude)[2], const real* restrict exclude_scale, const real* restrict x, const real* restrict y,
diff --git a/src/cu/hippo/sparsePrecond_cu6.cc b/src/cu/hippo/sparsePrecond_cu6.cc
index 35d4b2173..806312d4a 100644
--- a/src/cu/hippo/sparsePrecond_cu6.cc
+++ b/src/cu/hippo/sparsePrecond_cu6.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.1
+// ck.py Version 3.0.2
 __global__
 void sparsePrecond_cu6(int n, TINKER_IMAGE_PARAMS, real off, const unsigned* restrict uinfo, int nexclude,
    const int (*restrict exclude)[2], const real* restrict exclude_scale, const real* restrict x, const real* restrict y,
diff --git a/src/cu/hippo/ufieldChgpen_cu1.cc b/src/cu/hippo/ufieldChgpen_cu1.cc
index cdd131354..eafddc639 100644
--- a/src/cu/hippo/ufieldChgpen_cu1.cc
+++ b/src/cu/hippo/ufieldChgpen_cu1.cc
@@ -1,4 +1,4 @@
-// ck.py Version 3.0.1
+// ck.py Version 3.0.2
 template <class ETYP>
 __global__
 void ufieldChgpen_cu1(int n, TINKER_IMAGE_PARAMS, real off, const unsigned* restrict winfo, int nexclude,