diff --git a/pyscf_ipu/nanoDFT/compute_eri_utils.py b/pyscf_ipu/nanoDFT/compute_eri_utils.py
index 7619655..921d111 100644
--- a/pyscf_ipu/nanoDFT/compute_eri_utils.py
+++ b/pyscf_ipu/nanoDFT/compute_eri_utils.py
@@ -145,25 +145,92 @@ def prepare_integrals_2_inputs(mol):
     cpu_output   = np.zeros((num_calls, n_eri), dtype=np.float32)
     output_sizes = np.zeros((num_calls, 5))
 
+    
+    screened_indices_s8_4d = []
+    
+    # sample complete column as pattern seed
+    ERI = mol.intor('int2e_sph')
+    nonzero_seed = ERI[N-1, N-1, :, 0] != 0
+    tolerance = 1e-7
+
+    # print('test:')
+    # for k in range(N):
+    #     for l in range(k+1):
+    #         is_nonzero = ~(nonzero_seed[k] ^ nonzero_seed[l]) # not XOR
+    #         print(is_nonzero, end=' ')
+    #     print()
+    # exit()
+
+    # find max value
+    I_max = 0
+    for a in range(N):
+        for b in range(N):
+            abab      = np.abs(ERI[a,b,a,b])
+            if abab > I_max:
+                I_max = abab
+
+    # collect candidate pairs for s8
+    considered_indices = []
+    for a in range(N):
+        for b in range(a, N):
+            abab = np.abs(ERI[a,b,a,b])
+            if abab*I_max>=tolerance:
+                considered_indices.append((a, b)) # collect candidate pairs for s8
+
+    # generate s8 indices
+    for index, ab in enumerate(considered_indices):
+        a, b = ab
+        for cd in considered_indices[index:]:
+            c, d = cd
+            # if b<=d:
+            ok = True
+            if ~(nonzero_seed[d] ^ nonzero_seed[c]):
+                ok = ~(nonzero_seed[b] ^ nonzero_seed[a])
+            else:
+                ok = (nonzero_seed[b] ^ nonzero_seed[a])
+            if ok:
+                screened_indices_s8_4d.append((d, c, b, a))
+
     # Fill input_ijkl and output_sizes with the necessary indices.
     c = 0
     for i in range(n_bas):
             for j in range(n_bas):
                     for k in range(n_bas):
                             for l in range(n_bas):
-                                    # * 8-fold symmetry, k>=l, k>=i>=j,
-                                    if not ( i >= j and k >= l and i*j >= k * l): continue
-
-                                    input_ijkl[c] = [i, j, k, l]
-
                                     di = ao_loc[i+1] - ao_loc[i]
                                     dj = ao_loc[j+1] - ao_loc[j]
                                     dk = ao_loc[k+1] - ao_loc[k]
                                     dl = ao_loc[l+1] - ao_loc[l]
+                                    skip = True # !!!!
+                                    for ni, nj, nk, nl in screened_indices_s8_4d:
+                                        if ao_loc[i] <= ni and ni < ao_loc[i+1] and \
+                                           ao_loc[j] <= nj and nj < ao_loc[j+1] and \
+                                           ao_loc[k] <= nk and nk < ao_loc[k+1] and \
+                                           ao_loc[l] <= nl and nl < ao_loc[l+1]:
+                                            skip = False
+                                            break
+                                    if skip:
+                                        print('skipping', i, j, k, l, di*dj*dk*dl)
+                                        continue
+                                    # * 8-fold symmetry, k>=l, k>=i>=j,
+                                    # if not ( i >= j and k >= l and i*j >= k * l): continue
+
+                                    
+
+                                    # print('>>>>>', i, j, k, l)
+
+                                    
+
+                                    input_ijkl[c] = [i, j, k, l]
 
                                     output_sizes[c] = [di, dj, dk, dl, di*dj*dk*dl]
 
                                     c += 1
+    print('!!! saved', num_calls - c, 'calls i.e.', num_calls, '-', c)
+    num_calls = c
+    input_ijkl = input_ijkl[:num_calls, :]
+    cpu_output = cpu_output[:num_calls, :]
+    output_sizes = output_sizes[:num_calls, :]
 
     # Prepare IPU inputs.
     # Merge all int/float inputs in seperate arrays.
diff --git a/pyscf_ipu/nanoDFT/sparse_symmetric_intor_ERI.py b/pyscf_ipu/nanoDFT/sparse_symmetric_intor_ERI.py
index 345f9c4..032f62b 100644
--- a/pyscf_ipu/nanoDFT/sparse_symmetric_intor_ERI.py
+++ b/pyscf_ipu/nanoDFT/sparse_symmetric_intor_ERI.py
@@ -390,7 +390,8 @@ def ijkl2c(i, j, k, l):
 
     start = time.time()
 
-    mol = pyscf.gto.Mole(atom="".join(f"C 0 {1.54*j} {1.54*i};" for i in range(natm) for j in range(natm))) 
+    # mol = pyscf.gto.Mole(atom="".join(f"C 0 {1.54*j} {1.54*i};" for i in range(natm) for j in range(natm))) 
+    mol = pyscf.gto.Mole(atom="".join(f"C 0 {1.54*i} {1.54*i};" for i in range(natm))) 
     #mol = pyscf.gto.Mole(atom="".join(f"C 0 {15.4*j} {15.4*i};" for i in range(1) for j in range(75))) 
     mol.build()
     N = mol.nao_nr()