From 0e2347c8b14988cadfddcb17c36b8ad0c1d6a31f Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 27 Jun 2024 13:07:59 -0500
Subject: [PATCH 01/78] compat check

---
 pyscf_version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyscf_version.txt b/pyscf_version.txt
index 1f60d3ed..bd0fe629 100644
--- a/pyscf_version.txt
+++ b/pyscf_version.txt
@@ -1 +1 @@
-git+https://github.com/pyscf/pyscf.git@6512c8b042139ac21355a2657f98535474ddabdc
+git+https://github.com/pyscf/pyscf.git@d488cb7552130481407dbf698a9231459c21f291

From e7a24e40ef44907535608e79f4cbb2c44ef1f380 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Fri, 28 Jun 2024 14:59:49 -0500
Subject: [PATCH 02/78] lasscf_async keyframe comparison fns

orbital_block_svd and count_common_orbitals
---
 my_pyscf/mcscf/lasscf_async/keyframe.py | 123 +++++++++++++++++++++---
 1 file changed, 108 insertions(+), 15 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index e2ca8684..73548687 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -1,4 +1,5 @@
 import numpy as np
+from pyscf.lib import logger
 from scipy import linalg
 
 class LASKeyframe (object):
@@ -75,6 +76,42 @@ def approx_keyframe_ovlp (las, kf1, kf2):
             if mo_ovlp deviates significantly from 1.
     '''
 
+    u, svals, vh = orbital_block_svd (las, kf1, kf2)
+    mo_ovlp = np.prod (svals)
+
+    ci_ovlp = []
+    for ifrag, (fcibox, c1_r, c2_r) in enumerate (zip (las.fciboxes, kf1.ci, kf2.ci)):
+        nlas, nelelas = las.ncas_sub[ifrag], las.nelecas_sub[ifrag]
+        i = las.ncore + sum (las.ncas_sub[:ifrag])
+        j = i + las.ncas_sub[ifrag]
+        umat = u[i:j,i:j] @ vh[i:j,i:j]
+        c1_r = fcibox.states_transform_ci_for_orbital_rotation (c1_r, nlas, nelelas, umat)
+        ci_ovlp.append ([abs (c1.conj ().ravel ().dot (c2.ravel ()))
+                         for c1, c2 in zip (c1_r, c2_r)])
+
+    return mo_ovlp, ci_ovlp
+    
+def orbital_block_svd (las, kf1, kf2):
+    '''Evaluate the block-SVD of the orbitals of two keyframes. Blocks are inactive (core), active
+    of each fragment, and virtual.
+
+    Args:
+        las : object of :class:`LASCINoSymm`
+        kf1 : object of :class:`LASKeyframe`
+        kf2 : object of :class:`LASKeyframe`
+
+    Returns:
+        u : array of shape (nmo,nmo)
+            Block-diagonal unitary matrix of orbital rotations for kf1, keeping each subspace
+            unchanged but aligning the orbitals to identify the spaces the two keyframes have in
+            common, if any
+        svals : array of shape (nmo)
+            Singular values.
+        vh: array of shape (nmo,nmo)
+            Transpose of block-diagonal unitary matrix of orbital rotations for kf2, keeping each
+            subspace unchanged but aligning the orbitals to identify the spaces the two keyframes
+            have in common, if any
+    '''
     nao, nmo = kf1.mo_coeff.shape    
     ncore, ncas = las.ncore, las.ncas
     nocc = ncore + ncas
@@ -84,15 +121,11 @@ def approx_keyframe_ovlp (las, kf1, kf2):
     mo1 = kf1.mo_coeff[:,:ncore]
     mo2 = kf2.mo_coeff[:,:ncore]
     s1 = mo1.conj ().T @ s0 @ mo2
-    u, svals, vh = linalg.svd (s1)
-    mo_ovlp = np.prod (svals) # inactive orbitals
-    mo1 = kf1.mo_coeff[:,nocc:]
-    mo2 = kf2.mo_coeff[:,nocc:]
-    s1 = mo1.conj ().T @ s0 @ mo2
-    u, svals, vh = linalg.svd (s1)
-    mo_ovlp *= np.prod (svals) # virtual orbitals
+    u_core, svals_core, vh_core = linalg.svd (s1)
 
-    ci_ovlp = []
+    u = [u_core,]
+    svals = [svals_core,]
+    vh = [vh_core,]
     for ifrag, (fcibox, c1_r, c2_r) in enumerate (zip (las.fciboxes, kf1.ci, kf2.ci)):
         nlas, nelelas = las.ncas_sub[ifrag], las.nelecas_sub[ifrag]
         i = ncore + sum (las.ncas_sub[:ifrag])
@@ -100,12 +133,72 @@ def approx_keyframe_ovlp (las, kf1, kf2):
         mo1 = kf1.mo_coeff[:,i:j]
         mo2 = kf2.mo_coeff[:,i:j]
         s1 = mo1.conj ().T @ s0 @ mo2
-        u, svals, vh = linalg.svd (s1)
-        mo_ovlp *= np.prod (svals) # ifrag active orbitals
-        c1_r = fcibox.states_transform_ci_for_orbital_rotation (c1_r, nlas, nelelas, u @ vh)
-        ci_ovlp.append ([abs (c1.conj ().ravel ().dot (c2.ravel ()))
-                         for c1, c2 in zip (c1_r, c2_r)])
+        u_i, svals_i, vh_i = linalg.svd (s1)
+        u.append (u_i)
+        svals.append (svals_i)
+        vh.append (vh_i)
+
+    mo1 = kf1.mo_coeff[:,nocc:]
+    mo2 = kf2.mo_coeff[:,nocc:]
+    s1 = mo1.conj ().T @ s0 @ mo2
+    u_virt, svals_virt, vh_virt = linalg.svd (s1)
+    u.append (u_virt)
+    svals.append (svals_virt)
+    vh.append (vh_virt)
+
+    u = linalg.block_diag (*u)
+    svals = np.concatenate (svals)
+    vh = linalg.block_diag (*vh)
+
+    return u, svals, vh
+
+def count_common_orbitals (las, kf1, kf2, verbose=None):
+    '''Evaluate how many orbitals in each subspace two keyframes have in common
+
+    Args:
+        las : object of :class:`LASCINoSymm`
+        kf1 : object of :class:`LASKeyframe`
+        kf2 : object of :class:`LASKeyframe`
+
+    Kwargs:
+        verbose: integer or None
+
+    Returns:
+        ncommon_core : int
+        ncommon_active : list of length nfrags
+        ncommon_virt : int
+    '''
+    if verbose is None: verbose=las.verbose
+    ncore, ncas = las.ncore, las.ncas
+    nocc = ncore + ncas
+    nvirt = nmo - nocc
+    log = logger.new_logger (las, verbose)
+
+    u, svals, vh = orbital_block_svd (las, kf1, kf2)
+
+    fmt_str = '{:s} orbitals: {:d}/{:d} in common'
+    def _count (lbl, i, j):
+        ncommon = np.count_nonzero (np.isclose (svals[i:j], 1))
+        log.info (fmt_string.format (lbl, ncommon, j-i))
+        return ncommon
+
+    ncommon_core = _count ('Inactive', 0, ncore)
+    ncommon_active = []
+    j_list = np.cumsum (las.ncas_sub) + ncore
+    i_list = j_list - np.asarray (las.ncas_sub)
+    for ifrag, (i, j) in enumerate (zip (i_list, j_list)):
+        lbl = 'Active {:d}'.format (ifrag)
+        ncommon_active.append (_count (lbl, i, j))
+    ncommon_virt = _count ('Virtual', nocc, nmo)
+
+    return ncommon_core, ncommon_active, ncommon_virt
+
+
+
+
+
+
+
+
 
-    return mo_ovlp, ci_ovlp
-    
 

From 271d1f22899ee49170169eb850547149779f0eda Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Fri, 28 Jun 2024 15:57:20 -0500
Subject: [PATCH 03/78] lasscf_async keyframe comparisons printout

---
 my_pyscf/mcscf/lasscf_async/keyframe.py     |  3 ++-
 my_pyscf/mcscf/lasscf_async/lasscf_async.py | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 73548687..b2fb4fa0 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -169,6 +169,7 @@ def count_common_orbitals (las, kf1, kf2, verbose=None):
         ncommon_virt : int
     '''
     if verbose is None: verbose=las.verbose
+    nao, nmo = kf1.mo_coeff.shape    
     ncore, ncas = las.ncore, las.ncas
     nocc = ncore + ncas
     nvirt = nmo - nocc
@@ -179,7 +180,7 @@ def count_common_orbitals (las, kf1, kf2, verbose=None):
     fmt_str = '{:s} orbitals: {:d}/{:d} in common'
     def _count (lbl, i, j):
         ncommon = np.count_nonzero (np.isclose (svals[i:j], 1))
-        log.info (fmt_string.format (lbl, ncommon, j-i))
+        log.info (fmt_str.format (lbl, ncommon, j-i))
         return ncommon
 
     ncommon_core = _count ('Inactive', 0, ncore)
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index 76ec8696..d69e194e 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -1,12 +1,13 @@
+import itertools
 import numpy as np
 from scipy import linalg
 from pyscf import lib
 from pyscf.mcscf import mc1step
 from mrh.my_pyscf.mcscf import lasci, lasscf_sync_o0
 from mrh.my_pyscf.mcscf.lasscf_guess import interpret_frags_atoms
+from mrh.my_pyscf.mcscf.lasscf_async import keyframe
 from mrh.my_pyscf.mcscf.lasscf_async.split import get_impurity_space_constructor
 from mrh.my_pyscf.mcscf.lasscf_async.crunch import get_impurity_casscf
-from mrh.my_pyscf.mcscf.lasscf_async.keyframe import LASKeyframe
 from mrh.my_pyscf.mcscf.lasscf_async.combine import combine_o0
 
 def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
@@ -56,6 +57,16 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
             impurity.kernel ()
             kf2_list.append (impurity._push_keyframe (kf1))
 
+        # EXPERIMENTAL: examining differences in keyframes
+        for i in range (len (kf2_list)):
+            kfi = kf2_list[i]
+            log.info ('Comparing reference keyframe to fragment %d', i)
+            keyframe.count_common_orbitals (las, kf1, kfi)
+        for i, j in itertools.combinations (range (len (kf2_list)), 2):
+            kfi, kfj = kf2_list[i], kf2_list[j]
+            log.info ('Comparing keyframes for fragments %d and %d:', i, j)
+            keyframe.count_common_orbitals (las, kfi, kfj)
+
         # 3. Combine from fragments. TODO: smaller chunks instead of one whole-molecule function
         kf1 = combine_o0 (las, kf2_list)
 
@@ -140,7 +151,7 @@ class LASSCFNoSymm (lasci.LASCINoSymm):
     def get_keyframe (self, mo_coeff=None, ci=None):
         if mo_coeff is None: mo_coeff=self.mo_coeff
         if ci is None: ci=self.ci
-        return LASKeyframe (self, mo_coeff, ci)
+        return keyframe.LASKeyframe (self, mo_coeff, ci)
     as_scanner = mc1step.as_scanner
     def set_fragments_(self, frags_atoms=None, mo_coeff=None, localize_init_guess=True,
                        frags_by_AOs=False, **kwargs):

From cc8f642ed164134f02828f4c86182e39ed086e14 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 9 Jul 2024 11:27:56 -0500
Subject: [PATCH 04/78] issue #105 debug oversight

compare eris correctly
---
 my_pyscf/mcscf/las_ao2mo.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/my_pyscf/mcscf/las_ao2mo.py b/my_pyscf/mcscf/las_ao2mo.py
index 6293c357..4855f251 100644
--- a/my_pyscf/mcscf/las_ao2mo.py
+++ b/my_pyscf/mcscf/las_ao2mo.py
@@ -56,7 +56,9 @@ def get_h2eff_df (las, mo_coeff):
     if mem_enough_int:
         eri = lib.tag_array (eri, bmPu=np.concatenate (bmuP, axis=-1).transpose (0,2,1))
     if las.verbose > lib.logger.DEBUG:
-        eri_comp = las.with_df.ao2mo (mo, compact=True)
+        eri_comp = las.with_df.ao2mo (mo_coeff, compact=True)
+        eri_comp = eri_comp[:,ncore:nocc,ncore:nocc,ncore:nocc]
+        eri_comp = lib.pack_tril (eri_comp.reshape (nmo*ncas, ncas, ncas)).reshape (nmo, -1)
         lib.logger.debug(las,"CDERI two-step error: {}".format(linalg.norm(eri-eri_comp)))
     return eri
 

From 97b913b41422dfbc8142f46e45f32b8328668f2f Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 9 Jul 2024 16:36:18 -0500
Subject: [PATCH 05/78] pyscf compatibility check

---
 pyscf_version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyscf_version.txt b/pyscf_version.txt
index bd0fe629..d45effe2 100644
--- a/pyscf_version.txt
+++ b/pyscf_version.txt
@@ -1 +1 @@
-git+https://github.com/pyscf/pyscf.git@d488cb7552130481407dbf698a9231459c21f291
+git+https://github.com/pyscf/pyscf.git@beb7b1bcb40dec578392322d20126826f2d3e6ad

From 461b1efaa29fd6b128a5722f4e5cd9740e3d4b50 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 11 Jul 2024 16:58:39 -0500
Subject: [PATCH 06/78] quicksave

---
 my_pyscf/mcscf/lasscf_async/keyframe.py | 42 +++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index b2fb4fa0..3143cf9c 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -194,12 +194,50 @@ def _count (lbl, i, j):
 
     return ncommon_core, ncommon_active, ncommon_virt
 
+def get_kappa (las, kf1, kf2):
+    '''Decompose unitary matrix of orbital rotations between two keyframes as
 
+    | U11 U12 U13 ... |       | 0   -K'21 -K'31 ... |   | R11 0   0   ... |
+    | U21 U22 U23 ... | = exp | K21 0     -K'32 ... | * | 0   R22 0   ... |
+    | U31 U32 U33 ... |       | K31 K32   0     ... |   | 0   0   R33 ... |
+    | ... ... ... ... |       | ... ...   ...   ... |   | ... ... ... ... |
 
+    Where the first block is inactive orbitals, the next blocks are the active
+    orbitals of individual fragments, and the final block is virtual orbitals.
+    The lower triangle of the skew-symmetrix matrix gives the amplitudes of
+    the unitary group generators which transform the orbitals of kf1 into those
+    of kf2 after a decanonicalization of the latter given by the block-diagonal
+    matrix.
 
+    Args:
+        las : object of :class:`LASCINoSymm`
+        kf1 : object of :class:`LASKeyframe`
+        kf2 : object of :class:`LASKeyframe`
 
+    Returns:
+        kappa : ndarray of shape (nmo, nmo)
+            Skew-symmetric matrix of orbital rotation amplitudes whose lower
+            triangle gives the unitary generator amplitudes for transforming
+            from kf1 to kf2 (before orbital rotation given by ur
+        ur : ndarray of shape (nmo, nmo)
+            Block-diagonal unitary matrix. The overall unitary transformation
+            to go from the orbitals of kf1 to those of kf2 is expm(kappa)@ur
+    '''
+    mo1 = kf1.mo_coeff
+    mo2 = kf2.mo_coeff
+    s0 = las._scf.get_ovlp ()
+    ovlp = mo1.conj ().T @ s0 @ mo2
 
+    nao, nmo = mo1.shape
+    ncore, ncas = las.ncore, las.ncas
+    nocc = ncore + ncas
+    nvirt = nmo - nocc
+    nblk = [ncore,] + list (las.ncas_sub) + [nvirt,]
+    blkoff = np.cumsum (nblk)
 
-
-
+    kappa_raw = linalg.expm (ovlp)
+    idx_diag = np.zeros ((nmo,nmo), dtype=False)
+    skewerr = linalg.norm (kappa_raw + kappa_raw.T)
+    ur = np.eye (nmo)
+    
 

From 7e7850f11980179db42855c9d1a608cdfb886656 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Fri, 12 Jul 2024 12:18:39 -0500
Subject: [PATCH 07/78] lasscf_async keyframe get_kappa function

---
 my_pyscf/mcscf/lasscf_async/keyframe.py     | 58 +++++++++++++++++----
 my_pyscf/mcscf/lasscf_async/lasscf_async.py |  2 +
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 3143cf9c..2469e67c 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -197,6 +197,8 @@ def _count (lbl, i, j):
 def get_kappa (las, kf1, kf2):
     '''Decompose unitary matrix of orbital rotations between two keyframes as
 
+      <kf1|kf2>         = exp ( kappa )               *   rmat
+
     | U11 U12 U13 ... |       | 0   -K'21 -K'31 ... |   | R11 0   0   ... |
     | U21 U22 U23 ... | = exp | K21 0     -K'32 ... | * | 0   R22 0   ... |
     | U31 U32 U33 ... |       | K31 K32   0     ... |   | 0   0   R33 ... |
@@ -204,10 +206,16 @@ def get_kappa (las, kf1, kf2):
 
     Where the first block is inactive orbitals, the next blocks are the active
     orbitals of individual fragments, and the final block is virtual orbitals.
-    The lower triangle of the skew-symmetrix matrix gives the amplitudes of
-    the unitary group generators which transform the orbitals of kf1 into those
-    of kf2 after a decanonicalization of the latter given by the block-diagonal
-    matrix.
+    The skew-symmetric kappa matrix has zero diagonal blocks because the LASSCF
+    energy is invariant to those degrees of freedom, but it is not generally
+    possible to transform between any arbitrary pair of orbital bases without
+    them, so instead they are factorized via repeated BCH expansions:
+
+    kappa = lim n->infty kappa[n]
+    rmat = ... @ rmat[3] @ rmat[2] @ rmat[1] 
+
+    log ( ovlp[n-1] ) = kappa[n] + log ( rmat[n] )
+    ovlp[n] = ovlp[n-1] @ rmat[n].conj ().T
 
     Args:
         las : object of :class:`LASCINoSymm`
@@ -219,10 +227,16 @@ def get_kappa (las, kf1, kf2):
             Skew-symmetric matrix of orbital rotation amplitudes whose lower
             triangle gives the unitary generator amplitudes for transforming
             from kf1 to kf2 (before orbital rotation given by ur
-        ur : ndarray of shape (nmo, nmo)
+        rmat : ndarray of shape (nmo, nmo)
             Block-diagonal unitary matrix. The overall unitary transformation
             to go from the orbitals of kf1 to those of kf2 is expm(kappa)@ur
     '''
+    log = logger.new_logger (las, las.verbose)
+
+    # Initial guess for rmat using orbital_block_svd
+    u, svals, vh = orbital_block_svd (las, kf1, kf2)
+    rmat = u @ vh
+
     mo1 = kf1.mo_coeff
     mo2 = kf2.mo_coeff
     s0 = las._scf.get_ovlp ()
@@ -235,9 +249,35 @@ def get_kappa (las, kf1, kf2):
     nblk = [ncore,] + list (las.ncas_sub) + [nvirt,]
     blkoff = np.cumsum (nblk)
 
-    kappa_raw = linalg.expm (ovlp)
-    idx_diag = np.zeros ((nmo,nmo), dtype=False)
-    skewerr = linalg.norm (kappa_raw + kappa_raw.T)
-    ur = np.eye (nmo)
+    kappa = linalg.logm (ovlp @ rmat.conj ().T)
+    rmat1 = np.zeros_like (kappa)
+    skewerr = linalg.norm (kappa + kappa.T) 
+    if (skewerr/nmo)>1e-8:
+        log.error ('get_kappa matrix logarithm failed (skewerr = %e)', skewerr)
+    max_cycle = 100
+    log.debug ('get_kappa: iterating BCH expansion until maximum diagonal element is less than %e',
+               100*skewerr)
+    for it in range (max_cycle):
+        diagerr = 0
+        for i in range (len (nblk)):
+            i1 = blkoff[i]
+            i0 = i1 - nblk[i]
+            diagerr = max (diagerr, np.amax (np.abs (kappa[i0:i1,i0:i1])))
+            rmat1[i0:i1,i0:i1] = linalg.expm (kappa[i0:i1,i0:i1])
+        log.debug ('get_kappa iter %d diagerr: %e', it, diagerr)
+        if diagerr < 100*skewerr: break
+        rmat = rmat1 @ rmat
+        kappa = linalg.logm (ovlp @ rmat.conj ().T)
+    if diagerr > 100*skewerr:
+        log.warn ('get_kappa maxiter')
     
+    umat = linalg.expm (kappa) @ rmat
+    finalerr = linalg.norm ((umat.conj ().T @ ovlp) - np.eye (nmo))
+    log.debug ('get_kappa final error = %e (skewerr = %e)', finalerr, skewerr)
+
+    return kappa, rmat
+
+
+
+
 
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index d69e194e..ad2b7cb5 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -62,10 +62,12 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
             kfi = kf2_list[i]
             log.info ('Comparing reference keyframe to fragment %d', i)
             keyframe.count_common_orbitals (las, kf1, kfi)
+            keyframe.get_kappa (las, kf1, kfi)
         for i, j in itertools.combinations (range (len (kf2_list)), 2):
             kfi, kfj = kf2_list[i], kf2_list[j]
             log.info ('Comparing keyframes for fragments %d and %d:', i, j)
             keyframe.count_common_orbitals (las, kfi, kfj)
+            keyframe.get_kappa (las, kfi, kfj)
 
         # 3. Combine from fragments. TODO: smaller chunks instead of one whole-molecule function
         kf1 = combine_o0 (las, kf2_list)

From dfa4fd4f4b7af29a534c2f3919fa16d9e0a99530 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Fri, 12 Jul 2024 12:30:45 -0500
Subject: [PATCH 08/78] get_kappa proper parameters

---
 my_pyscf/mcscf/lasscf_async/keyframe.py | 45 +++++++++++++++----------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 2469e67c..def9cfe3 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -237,27 +237,33 @@ def get_kappa (las, kf1, kf2):
     u, svals, vh = orbital_block_svd (las, kf1, kf2)
     rmat = u @ vh
 
-    mo1 = kf1.mo_coeff
-    mo2 = kf2.mo_coeff
-    s0 = las._scf.get_ovlp ()
-    ovlp = mo1.conj ().T @ s0 @ mo2
+    # Iteration parameters
+    tol_strict = 1e-8
+    tol_target = 1e-10
+    max_cycle = 100
 
-    nao, nmo = mo1.shape
+    # Indexing
+    nao, nmo = kf1.mo_coeff.shape
     ncore, ncas = las.ncore, las.ncas
     nocc = ncore + ncas
     nvirt = nmo - nocc
     nblk = [ncore,] + list (las.ncas_sub) + [nvirt,]
     blkoff = np.cumsum (nblk)
 
-    kappa = linalg.logm (ovlp @ rmat.conj ().T)
-    rmat1 = np.zeros_like (kappa)
-    skewerr = linalg.norm (kappa + kappa.T) 
-    if (skewerr/nmo)>1e-8:
-        log.error ('get_kappa matrix logarithm failed (skewerr = %e)', skewerr)
-    max_cycle = 100
+    # Iteration
+    mo1 = kf1.mo_coeff
+    mo2 = kf2.mo_coeff
+    s0 = las._scf.get_ovlp ()
+    ovlp = mo1.conj ().T @ s0 @ mo2
+    rmat1 = np.zeros_like (rmat)
+    lasterr = 1
     log.debug ('get_kappa: iterating BCH expansion until maximum diagonal element is less than %e',
-               100*skewerr)
+               tol_target)
     for it in range (max_cycle):
+        kappa = linalg.logm (ovlp @ rmat.conj ().T)
+        skewerr = linalg.norm (kappa + kappa.T) 
+        if (skewerr/nmo)>tol_strict:
+            log.error ('get_kappa matrix logarithm failed (skewerr = %e)', skewerr)
         diagerr = 0
         for i in range (len (nblk)):
             i1 = blkoff[i]
@@ -265,15 +271,20 @@ def get_kappa (las, kf1, kf2):
             diagerr = max (diagerr, np.amax (np.abs (kappa[i0:i1,i0:i1])))
             rmat1[i0:i1,i0:i1] = linalg.expm (kappa[i0:i1,i0:i1])
         log.debug ('get_kappa iter %d diagerr: %e', it, diagerr)
-        if diagerr < 100*skewerr: break
+        if (diagerr < tol_target) or ((diagerr<tol_strict) and (diagerr>lasterr)): break
+        # If this runs for infinitely many cycles it will always diverge. I'd like to get to
+        # 1e-10 but if 1e-8 is the best it can do then it should stop there.
+        lasterr = diagerr
         rmat = rmat1 @ rmat
-        kappa = linalg.logm (ovlp @ rmat.conj ().T)
-    if diagerr > 100*skewerr:
-        log.warn ('get_kappa maxiter')
+    if diagerr > tol_strict:
+        log.warn ('get_kappa iteration failed after %d cycles with err = %e',
+                  it, diagerr)
     
+    # Final check
     umat = linalg.expm (kappa) @ rmat
     finalerr = linalg.norm ((umat.conj ().T @ ovlp) - np.eye (nmo))
-    log.debug ('get_kappa final error = %e (skewerr = %e)', finalerr, skewerr)
+    log.debug ('get_kappa final error = %e', finalerr)
+    assert (finalerr < tol_strict)
 
     return kappa, rmat
 

From 56b90f80db8e920b3ff6b9547cff0c5b621b923b Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Fri, 12 Jul 2024 12:44:20 -0500
Subject: [PATCH 09/78] docstring oops

---
 my_pyscf/mcscf/lasscf_async/keyframe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index def9cfe3..8d6d347c 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -226,10 +226,10 @@ def get_kappa (las, kf1, kf2):
         kappa : ndarray of shape (nmo, nmo)
             Skew-symmetric matrix of orbital rotation amplitudes whose lower
             triangle gives the unitary generator amplitudes for transforming
-            from kf1 to kf2 (before orbital rotation given by ur
+            from kf1 to kf2
         rmat : ndarray of shape (nmo, nmo)
             Block-diagonal unitary matrix. The overall unitary transformation
-            to go from the orbitals of kf1 to those of kf2 is expm(kappa)@ur
+            to go from the orbitals of kf1 to those of kf2 is expm(kappa)@rmat
     '''
     log = logger.new_logger (las, las.verbose)
 

From 090e4c2805bae9cab2010963efa9ac2c856111b9 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Fri, 12 Jul 2024 15:24:57 -0500
Subject: [PATCH 10/78] get_kappa docstring notes

---
 my_pyscf/mcscf/lasscf_async/keyframe.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 8d6d347c..71f52fc6 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -214,9 +214,14 @@ def get_kappa (las, kf1, kf2):
     kappa = lim n->infty kappa[n]
     rmat = ... @ rmat[3] @ rmat[2] @ rmat[1] 
 
-    log ( ovlp[n-1] ) = kappa[n] + log ( rmat[n] )
+    ovlp[0] = (kf1.mo_coeff|kf2.mo_coeff)
+    log (ovlp[n-1]) = kappa[n] + log (rmat[n])
     ovlp[n] = ovlp[n-1] @ rmat[n].conj ().T
 
+    The first-order correction to log (rmat[n]) vanishes because the commutator
+    [kappa, log (rmat)] diagonal blocks are zero. So this should converge fast.
+    If it doesn't, maybe try solving for rmat[n] to second order in each cycle?
+
     Args:
         las : object of :class:`LASCINoSymm`
         kf1 : object of :class:`LASKeyframe`

From aba161d11a5b2df7191ebd59885652eaaeb63daf Mon Sep 17 00:00:00 2001
From: Bhavnesh Jangid <bhuvnesh.jangid10@gmail.com>
Date: Fri, 12 Jul 2024 16:21:22 -0500
Subject: [PATCH 11/78] Specific State PDFT Calculation Only

---
 my_pyscf/mcpdft/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/my_pyscf/mcpdft/__init__.py b/my_pyscf/mcpdft/__init__.py
index beb29124..600b5038 100644
--- a/my_pyscf/mcpdft/__init__.py
+++ b/my_pyscf/mcpdft/__init__.py
@@ -76,7 +76,7 @@ def _laspdftEnergy(mc_class, mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub, DoLASSI
 
 
 def _lassipdftEnergy(mc_class, mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub, DoLASSI=False, ncore=None, spin_sub=None,
-                   frozen=None, **kwargs):
+                   frozen=None, states=None,**kwargs):
 
     from mrh.my_pyscf.lassi import lassi
 
@@ -89,7 +89,7 @@ def _lassipdftEnergy(mc_class, mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub, DoLAS
     mc1 = mc_class(mf_or_mol, ncas_sub, nelecas_sub, ncore=ncore, spin_sub=spin_sub)
 
     from mrh.my_pyscf.mcpdft.laspdft import get_mcpdft_child_class
-    mc2 = get_mcpdft_child_class(mc1, ot, DoLASSI=DoLASSI, **kwargs)
+    mc2 = get_mcpdft_child_class(mc1, ot, DoLASSI=DoLASSI,states=states, **kwargs)
 
     if mc0 is not None:
         mc2.mo_coeff = mc_or_mf_or_mol.mo_coeff.copy()
@@ -108,10 +108,10 @@ def LASSCFPDFT(mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub,  ncore=None, spin_sub
                           spin_sub=spin_sub, frozen=frozen, **kwargs)
 
 def LASSIPDFT(mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub, ncore=None, spin_sub=None, frozen=None,
-               **kwargs):
+        states=None, **kwargs):
     from mrh.my_pyscf.mcscf.lasscf_o0 import LASSCF
     return _lassipdftEnergy(LASSCF,  mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub, DoLASSI=True, ncore=ncore,
-                          spin_sub=spin_sub, frozen=frozen, **kwargs)
+                          spin_sub=spin_sub, frozen=frozen, states=states, **kwargs)
 
 
 LASSCF = LASSCFPDFT

From 07a8982bf08fbf340e23afe0505c6c1a0c22ab17 Mon Sep 17 00:00:00 2001
From: Bhavnesh Jangid <bhuvnesh.jangid10@gmail.com>
Date: Fri, 12 Jul 2024 16:21:34 -0500
Subject: [PATCH 12/78] Specific State PDFT Calculation Only

---
 my_pyscf/mcpdft/laspdft.py | 60 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/my_pyscf/mcpdft/laspdft.py b/my_pyscf/mcpdft/laspdft.py
index e9a4e42b..f3566e05 100644
--- a/my_pyscf/mcpdft/laspdft.py
+++ b/my_pyscf/mcpdft/laspdft.py
@@ -1,4 +1,5 @@
 from pyscf import ao2mo, lib
+from pyscf.mcscf.addons import StateAverageMCSCFSolver
 import numpy as np
 import copy
 from scipy import linalg
@@ -32,21 +33,76 @@ def get_h2eff(self, mo_coeff=None):
             eri = ao2mo.full(self.mol, mo_coeff, verbose=self.verbose,
                                 max_memory=self.max_memory)
         return eri
+
+    def compute_pdft_energy_(self, mo_coeff=None, ci=None, ot=None, otxc=None,
+                             grids_level=None, grids_attr=None, **kwargs):
+        '''Compute the MC-PDFT energy(ies) (and update stored data)
+        with the MC-SCF wave function fixed. '''
+        '''
+        Rather than computing the energies of every state, this variant can
+        evaluate the PDFT correction for only a chosen list of state indices
+        '''
+        if mo_coeff is not None: self.mo_coeff = mo_coeff
+        if ci is not None: self.ci = ci
+        if ot is not None: self.otfnal = ot
+        if otxc is not None: self.otxc = otxc
+        if grids_attr is None: grids_attr = {}
+        if grids_level is not None: grids_attr['level'] = grids_level
+        if len(grids_attr): self.grids.__dict__.update(**grids_attr)
+        nroots = getattr(self.fcisolver, 'nroots', 1)
+        if isinstance(nroots, list):
+            epdft = [self.energy_tot(mo_coeff=self.mo_coeff, ci=self.ci, state=ix,
+                                 logger_tag='MC-PDFT state {}'.format(ix))
+                                for ix in nroots]
+        else:
+            epdft = [self.energy_tot(mo_coeff=self.mo_coeff, ci=self.ci, state=ix,
+                                 logger_tag='MC-PDFT state {}'.format(ix))
+                                for ix in range(nroots)]
+
+        self.e_ot = [e_ot for e_tot, e_ot in epdft]
         
-def get_mcpdft_child_class(mc, ot, DoLASSI=False,  **kwargs):
+        if isinstance(self, StateAverageMCSCFSolver):
+            e_states = [e_tot for e_tot, e_ot in epdft]
+            try:
+                self.e_states = e_states
+            except AttributeError as e:
+                self.fcisolver.e_states = e_states
+                assert (self.e_states is e_states), str(e)
+            # TODO: redesign this. MC-SCF e_states is stapled to
+            # fcisolver.e_states, but I don't want MS-PDFT to be
+            # because that makes no sense
+            self.e_tot = np.dot(e_states, self.weights)
+            e_states = self.e_states
+        elif (len(nroots) > 1 if isinstance(nroots, list) else nroots > 1):
+            self.e_tot = [e_tot for e_tot, e_ot in epdft]
+            e_states = self.e_tot
+        else:  # nroots==1 not StateAverage class
+            self.e_tot, self.e_ot = epdft[0]
+            e_states = [self.e_tot]
+        return self.e_tot, self.e_ot, e_states
+
+def get_mcpdft_child_class(mc, ot, DoLASSI=False,states=None,**kwargs):
     mc_doc = (mc.__class__.__doc__ or 'No docstring for MC-SCF parent method')
    
     class PDFT(_LASPDFT, mc.__class__):
         __doc__= mc_doc + '\n\n' + _LASPDFT.__doc__
         _mc_class = mc.__class__
         setattr(_mc_class, 'DoLASSI', None)
+        setattr(_mc_class, 'states', None)
 
         def get_h2eff(self, mo_coeff=None):
             if self._in_mcscf_env: return mc.__class__.get_h2eff(self, mo_coeff=mo_coeff)
             else: return _LASPDFT.get_h2eff(self, mo_coeff=mo_coeff)
         
+        def compute_pdft_energy_(self, mo_coeff=None, ci=None, ot=None, otxc=None,
+                             grids_level=None, grids_attr=None, states=states, **kwargs):
+            return _LASPDFT.compute_pdft_energy_(self, mo_coeff=mo_coeff, ci=ci, ot=ot, otxc=otxc,
+                             grids_level=grids_level, grids_attr=grids_attr, **kwargs)
+
         if DoLASSI:  _mc_class.DoLASSI = True
         else: _mc_class.DoLASSI = False
+        
+        if states is not None: _mc_class.states=states
 
         if _mc_class.DoLASSI:
             # This code doesn't seem efficent, have to calculate the casdm1 and casdm2 in different functions.
@@ -69,7 +125,7 @@ def optimize_mcscf_(self, mo_coeff=None, ci0=None, **kwargs):
             Has the same calling signature as the parent kernel method. '''
             with _mcscf_env(self):
                 if self.DoLASSI:
-                    self.fcisolver.nroots = len(self.e_states)
+                    self.fcisolver.nroots = len(self.e_states) if self.states is None else self.states
                     self.e_states = self.e_roots
                 else:
                     self.e_mcscf, self.e_cas, self.ci, self.mo_coeff, self.mo_energy = \

From 2dbbf3bf9bd8c81efbc1dba364e0f73c10eeffb5 Mon Sep 17 00:00:00 2001
From: Bhavnesh Jangid <bhuvnesh.jangid10@gmail.com>
Date: Fri, 12 Jul 2024 16:23:04 -0500
Subject: [PATCH 13/78] Specific State PDFT for LASSI, example updated

---
 examples/laspdft/c2h4n4_si_laspdft.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/laspdft/c2h4n4_si_laspdft.py b/examples/laspdft/c2h4n4_si_laspdft.py
index 11f55410..d60dd138 100755
--- a/examples/laspdft/c2h4n4_si_laspdft.py
+++ b/examples/laspdft/c2h4n4_si_laspdft.py
@@ -30,7 +30,7 @@
 lsi.kernel()
 
 # LASSI-PDFT
-mc = mcpdft.LASSI(lsi, 'tPBE', (3, 3), ((2,1),(1,2)))
+mc = mcpdft.LASSI(lsi, 'tPBE', (3, 3), ((2,1),(1,2)), states=[0, 1])
 mc.kernel() 
 
 # CASCI-PDFT in las orbitals

From 97c0ce18774bb2fd4000d42a1019b4c4c25c7073 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Fri, 12 Jul 2024 16:41:59 -0500
Subject: [PATCH 14/78] lasscf_async keyframe safe_svd

---
 my_pyscf/mcscf/lasscf_async/keyframe.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 71f52fc6..e3813453 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -1,6 +1,7 @@
 import numpy as np
 from pyscf.lib import logger
 from scipy import linalg
+from mrh.util.la import safe_svd_warner
 
 class LASKeyframe (object):
     '''Shallow struct for various intermediates. DON'T put complicated code in here Matt!!!'''
@@ -112,6 +113,8 @@ def orbital_block_svd (las, kf1, kf2):
             subspace unchanged but aligning the orbitals to identify the spaces the two keyframes
             have in common, if any
     '''
+    log = logger.new_logger (las, las.verbose)
+    svd = safe_svd_warner (log.warn)
     nao, nmo = kf1.mo_coeff.shape    
     ncore, ncas = las.ncore, las.ncas
     nocc = ncore + ncas
@@ -121,7 +124,7 @@ def orbital_block_svd (las, kf1, kf2):
     mo1 = kf1.mo_coeff[:,:ncore]
     mo2 = kf2.mo_coeff[:,:ncore]
     s1 = mo1.conj ().T @ s0 @ mo2
-    u_core, svals_core, vh_core = linalg.svd (s1)
+    u_core, svals_core, vh_core = svd (s1)
 
     u = [u_core,]
     svals = [svals_core,]
@@ -133,7 +136,7 @@ def orbital_block_svd (las, kf1, kf2):
         mo1 = kf1.mo_coeff[:,i:j]
         mo2 = kf2.mo_coeff[:,i:j]
         s1 = mo1.conj ().T @ s0 @ mo2
-        u_i, svals_i, vh_i = linalg.svd (s1)
+        u_i, svals_i, vh_i = svd (s1)
         u.append (u_i)
         svals.append (svals_i)
         vh.append (vh_i)
@@ -141,7 +144,7 @@ def orbital_block_svd (las, kf1, kf2):
     mo1 = kf1.mo_coeff[:,nocc:]
     mo2 = kf2.mo_coeff[:,nocc:]
     s1 = mo1.conj ().T @ s0 @ mo2
-    u_virt, svals_virt, vh_virt = linalg.svd (s1)
+    u_virt, svals_virt, vh_virt = svd (s1)
     u.append (u_virt)
     svals.append (svals_virt)
     vh.append (vh_virt)

From f5e1f4cf56ac355fc72c1c8b3da691fed1b25f99 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 15 Jul 2024 12:22:29 -0500
Subject: [PATCH 15/78] lasscf_async.combine impweights

---
 my_pyscf/mcscf/lasscf_async/combine.py      | 18 ++++++++++++++++++
 my_pyscf/mcscf/lasscf_async/lasscf_async.py |  2 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index a868b3de..770810bf 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -137,4 +137,22 @@ def combine_o0 (las, kf2_list):
     kf1 = relax (las, kf1)
     return kf1
 
+def impweights (las, mo_coeff, impurities):
+    '''Compute the weights of each MO in mo_coeff on the various impurities.
+
+    Args:
+        las : object of :class:`LASCINoSymm`
+        mo_coeff : ndarray of shape (nao,nmo)
+        impurities: list of length nfrag of objects of :class:`ImpurityCASSCF`
+
+    Returns:
+        weights: ndarray of shape (nmo, nfrag)
+    '''
+    smoH = mo_coeff.conj ().T @ las._scf.get_ovlp ()
+    weights = []
+    for imp in impurities:
+        a = smoH @ imp.mol.get_imporb_coeff ()
+        weights.append ((a @ a.conj ().T).diagonal ())
+    return np.stack (weights, axis=1)
+
 
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index ad2b7cb5..ab446249 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -5,7 +5,7 @@
 from pyscf.mcscf import mc1step
 from mrh.my_pyscf.mcscf import lasci, lasscf_sync_o0
 from mrh.my_pyscf.mcscf.lasscf_guess import interpret_frags_atoms
-from mrh.my_pyscf.mcscf.lasscf_async import keyframe
+from mrh.my_pyscf.mcscf.lasscf_async import keyframe, combine
 from mrh.my_pyscf.mcscf.lasscf_async.split import get_impurity_space_constructor
 from mrh.my_pyscf.mcscf.lasscf_async.crunch import get_impurity_casscf
 from mrh.my_pyscf.mcscf.lasscf_async.combine import combine_o0

From 3f7208236b00699d32f04138bd2f3ab808f99ede Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 15 Jul 2024 15:30:37 -0500
Subject: [PATCH 16/78] lasscf_async impweights on keyframe

---
 my_pyscf/mcscf/lasscf_async/crunch.py   | 6 +++++-
 my_pyscf/mcscf/lasscf_async/keyframe.py | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 8e107c55..7345060d 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -357,6 +357,11 @@ def _push_keyframe (self, kf1, mo_coeff=None, ci=None):
         imporb_coeff = self.mol.get_imporb_coeff ()
         mo_self = imporb_coeff @ mo_coeff
 
+        # impweights for combining updates
+        s0 = las._scf.get_ovlp ()
+        ovlp = kf1.mo_coeff.conj ().T @ s0 @ imporb_coeff
+        kf2.impweights = (ovlp @ ovlp.conj ().T).diagonal ()
+
         # active orbital part should be easy
         kf2.ci[self._ifrag] = self.ci
         las = self.mol._las
@@ -367,7 +372,6 @@ def _push_keyframe (self, kf1, mo_coeff=None, ci=None):
         kf2.mo_coeff[:,i:j] = mo_self[:,k:l]
 
         # Unentangled inactive orbitals
-        s0 = las._scf.get_ovlp ()
         ncore_unent = las.ncore - self.ncore
         assert (ncore_unent>=0), '{} {}'.format (las.ncore, self.ncore)
         if las.ncore:
diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index e3813453..98c607de 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -11,6 +11,7 @@ def __init__(self, las, mo_coeff, ci):
         self.mo_coeff = mo_coeff
         self.ci = ci
         self._dm1s = self._veff = self._fock1 = self._h1eff_sub = self._h2eff_sub = None
+        self.impweights = None
 
     @property
     def dm1s (self):

From 74e7b651dac8859e8c75053ebbb72030c09884c5 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 15 Jul 2024 15:54:20 -0500
Subject: [PATCH 17/78] safety commit: WIP combine_impweighted in lasscf_async.combine

---
 my_pyscf/mcscf/lasscf_async/combine.py | 54 ++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 770810bf..e193a341 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -5,6 +5,7 @@
 from pyscf.lo import orth
 from pyscf.scf.rohf import get_roothaan_fock
 from mrh.my_pyscf.mcscf import lasci, _DFLASCI
+from mrh.my_pyscf.mcscf.lasscf_async import keyframe
 
 # TODO: symmetry
 def orth_orb (las, kf2_list):
@@ -155,4 +156,57 @@ def impweights (las, mo_coeff, impurities):
         weights.append ((a @ a.conj ().T).diagonal ())
     return np.stack (weights, axis=1)
 
+def combine_impweighted (las, kf1, kf2, kf_ref):
+    '''Combine two keyframes (without relaxing the active orbitals) by weighting the kappa matrices
+    with respect to a third reference keyframe by the impweights parameter
 
+    Args:
+        las : object of :class:`LASCINoSymm`
+        kf1 : object of :class:`LASKeyframe`
+        kf2 : object of :class:`LASKeyframe`
+        kf_ref : object of :class:`LASKeyframe`
+            Reference point for the kappa matrices
+
+    Returns:
+        kf3 : object of :class:`LASKeyframe`
+    '''
+    kf3 = kf_ref.copy ()
+    w1 = np.add.outer (kf1.impweights, kf2.impweights)
+    w2 = np.add.outer (kf1.impweights, kf2.impweights)
+    kappa1, rmat1 = keyframe.get_kappa (las, kf1, kf_ref)
+    kappa2, rmat2 = keyframe.get_kappa (las, kf2, kf_ref)
+    kappa = (w1*kappa1) + (w2*kappa2)
+    rmat = np.eye (kf_ref.mo_coeff.shape[1])
+
+    # Figure out which fragments are associated w the two keyframes
+    offs = np.cumsum (las.ncas_sub) + ncore
+    kf1_frags = []
+    kf2_frags = []
+    for i in range (len (las.nfrags)):
+        i1 = offs[i]
+        i0 = i1 - las.ncas_sub[i]
+        # kf1
+        w = sum (kf1.impweights[i0:i1]) / las.ncas_sub[i]
+        if np.isclose (w, 1):
+            kf3.ci[i] = kf1.ci[i]
+            rmat[i0:i1,i0:i1] = rmat1[i0:i1,i0:i1]
+        elif abs (w) > 1e-4:
+            raise RuntimeError ("fragment split between impurities? ({})".format (w))
+        # kf2
+        w = sum (kf2.impweights[i0:i1]) / las.ncas_sub[i]
+        if np.isclose (w, 1):
+            kf3.ci[i] = kf2.ci[i]
+            rmat[i0:i1,i0:i1] = rmat2[i0:i1,i0:i1]
+        elif abs (w) > 1e-4:
+            raise RuntimeError ("fragment split between impurities? ({})".format (w))
+
+    # set orbitals and impweights
+    umat = linalg.expm (kappa) @ rmat
+    kf3.mo_coeff = kf_ref.mo_coeff @ umat
+    kf3.impweights = kf1.impweights + kf2.impweights
+    
+    return kf3
+
+
+
+    

From 7b86a4e5fdf8e5210c66aa6bd28efe11d4d18d32 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 15 Jul 2024 15:57:13 -0500
Subject: [PATCH 18/78] proper weighting: normalize impweighted kappa by w1 + w2

---
 my_pyscf/mcscf/lasscf_async/combine.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index e193a341..514757b7 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -175,7 +175,9 @@ def combine_impweighted (las, kf1, kf2, kf_ref):
     w2 = np.add.outer (kf1.impweights, kf2.impweights)
     kappa1, rmat1 = keyframe.get_kappa (las, kf1, kf_ref)
     kappa2, rmat2 = keyframe.get_kappa (las, kf2, kf_ref)
-    kappa = (w1*kappa1) + (w2*kappa2)
+    denom = w1 + w2
+    denom[denom<1e-8] = 1e-8
+    kappa = ((w1*kappa1) + (w2*kappa2)) / denom
     rmat = np.eye (kf_ref.mo_coeff.shape[1])
 
     # Figure out which fragments are associated w the two keyframes

From 8444221fe6fa62381a467bd4edd8a8093bd90bac Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 15 Jul 2024 16:51:41 -0500
Subject: [PATCH 19/78] oops: define las before first use in _push_keyframe

---
 my_pyscf/mcscf/lasscf_async/crunch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 7345060d..75c87537 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -356,6 +356,7 @@ def _push_keyframe (self, kf1, mo_coeff=None, ci=None):
         kf2 = kf1.copy ()
         imporb_coeff = self.mol.get_imporb_coeff ()
         mo_self = imporb_coeff @ mo_coeff
+        las = self.mol._las
 
         # impweights for combining updates
         s0 = las._scf.get_ovlp ()
@@ -364,7 +365,6 @@ def _push_keyframe (self, kf1, mo_coeff=None, ci=None):
 
         # active orbital part should be easy
         kf2.ci[self._ifrag] = self.ci
-        las = self.mol._las
         i = las.ncore + sum (las.ncas_sub[:self._ifrag])
         j = i + las.ncas_sub[self._ifrag]
         k = self.ncore

From 028fdd28cc59b35a17d44cce212ed73dbb354972 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 16 Jul 2024 12:09:31 -0500
Subject: [PATCH 20/78] syntax and math stability

---
 my_pyscf/mcscf/lasscf_async/combine.py      | 13 ++++++++++---
 my_pyscf/mcscf/lasscf_async/keyframe.py     |  5 +++--
 my_pyscf/mcscf/lasscf_async/lasscf_async.py |  3 +--
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 514757b7..799dd207 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -138,6 +138,13 @@ def combine_o0 (las, kf2_list):
     kf1 = relax (las, kf1)
     return kf1
 
+def combine_o1 (las, kf2_list, kf_ref):
+    kf1 = kf2_list[0]
+    for kf2 in kf2_list[1:]:
+        kf1 = combine_o1_rigid (las, kf1, kf2, kf_ref)
+    kf1 = relax (las, kf1)
+    return kf1
+
 def impweights (las, mo_coeff, impurities):
     '''Compute the weights of each MO in mo_coeff on the various impurities.
 
@@ -156,7 +163,7 @@ def impweights (las, mo_coeff, impurities):
         weights.append ((a @ a.conj ().T).diagonal ())
     return np.stack (weights, axis=1)
 
-def combine_impweighted (las, kf1, kf2, kf_ref):
+def combine_o1_rigid (las, kf1, kf2, kf_ref):
     '''Combine two keyframes (without relaxing the active orbitals) by weighting the kappa matrices
     with respect to a third reference keyframe by the impweights parameter
 
@@ -181,10 +188,10 @@ def combine_impweighted (las, kf1, kf2, kf_ref):
     rmat = np.eye (kf_ref.mo_coeff.shape[1])
 
     # Figure out which fragments are associated w the two keyframes
-    offs = np.cumsum (las.ncas_sub) + ncore
+    offs = np.cumsum (las.ncas_sub) + las.ncore
     kf1_frags = []
     kf2_frags = []
-    for i in range (len (las.nfrags)):
+    for i in range (las.nfrags):
         i1 = offs[i]
         i0 = i1 - las.ncas_sub[i]
         # kf1
diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 98c607de..b252741c 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -247,7 +247,7 @@ def get_kappa (las, kf1, kf2):
     rmat = u @ vh
 
     # Iteration parameters
-    tol_strict = 1e-8
+    tol_strict = 1e-6
     tol_target = 1e-10
     max_cycle = 100
 
@@ -273,6 +273,7 @@ def get_kappa (las, kf1, kf2):
         skewerr = linalg.norm (kappa + kappa.T) 
         if (skewerr/nmo)>tol_strict:
             log.error ('get_kappa matrix logarithm failed (skewerr = %e)', skewerr)
+        kappa = .5 * (kappa - kappa.T)
         diagerr = 0
         for i in range (len (nblk)):
             i1 = blkoff[i]
@@ -280,7 +281,7 @@ def get_kappa (las, kf1, kf2):
             diagerr = max (diagerr, np.amax (np.abs (kappa[i0:i1,i0:i1])))
             rmat1[i0:i1,i0:i1] = linalg.expm (kappa[i0:i1,i0:i1])
         log.debug ('get_kappa iter %d diagerr: %e', it, diagerr)
-        if (diagerr < tol_target) or ((diagerr<tol_strict) and (diagerr>lasterr)): break
+        if (diagerr < tol_target) or ((lasterr<tol_strict) and (diagerr>lasterr)): break
         # If you run this for infinity cycles it will always diverge. I'd like to get to
         # 1e-10 but if 1e-8 is the best it can do then it should stop there.
         lasterr = diagerr
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index ab446249..a126e4bb 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -8,7 +8,6 @@
 from mrh.my_pyscf.mcscf.lasscf_async import keyframe, combine
 from mrh.my_pyscf.mcscf.lasscf_async.split import get_impurity_space_constructor
 from mrh.my_pyscf.mcscf.lasscf_async.crunch import get_impurity_casscf
-from mrh.my_pyscf.mcscf.lasscf_async.combine import combine_o0
 
 def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
             assert_no_dupes=False, verbose=lib.logger.NOTE, frags_orbs=None,
@@ -70,7 +69,7 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
             keyframe.get_kappa (las, kfi, kfj)
 
         # 3. Combine from fragments. TODO: smaller chunks instead of one whole-molecule function
-        kf1 = combine_o0 (las, kf2_list)
+        kf1 = combine.combine_o1 (las, kf2_list, kf1)
 
         # Evaluate status and break if converged
         e_tot = las.energy_nuc () + las.energy_elec (

From 1d2528f4881b508b66bd6f9ccb71329acde3a3a1 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 16 Jul 2024 13:49:02 -0500
Subject: [PATCH 21/78] get_kappa complex issues

---
 my_pyscf/mcscf/lasscf_async/combine.py  | 15 +++++++++--
 my_pyscf/mcscf/lasscf_async/keyframe.py | 35 ++++++++++++++++---------
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 799dd207..7bd640ff 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -177,6 +177,8 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
     Returns:
         kf3 : object of :class:`LASKeyframe`
     '''
+    log = lib.logger.new_logger (las, las.verbose)
+    nmo = las.mo_coeff.shape[1]
     kf3 = kf_ref.copy ()
     w1 = np.add.outer (kf1.impweights, kf2.impweights)
     w2 = np.add.outer (kf1.impweights, kf2.impweights)
@@ -184,8 +186,9 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
     kappa2, rmat2 = keyframe.get_kappa (las, kf2, kf_ref)
     denom = w1 + w2
     denom[denom<1e-8] = 1e-8
-    kappa = ((w1*kappa1) + (w2*kappa2)) / denom
-    rmat = np.eye (kf_ref.mo_coeff.shape[1])
+    #kappa = ((w1*kappa1) + (w2*kappa2)) / denom
+    kappa = kappa1 + kappa2
+    rmat = np.eye (nmo) + np.zeros_like (rmat1) + np.zeros_like (rmat2) # complex safety
 
     # Figure out which fragments are associated w the two keyframes
     offs = np.cumsum (las.ncas_sub) + las.ncore
@@ -211,6 +214,14 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
 
     # set orbitals and impweights
     umat = linalg.expm (kappa) @ rmat
+    if np.iscomplexobj (umat):
+        log.warn ('Complex umat constructed. Discarding imaginary part; norm: %e',
+                  linalg.norm (umat.imag))
+        print ("Rmat's fault or kappa's fault or both?",
+               linalg.norm (kappa.imag),
+               linalg.norm (linalg.expm (kappa).imag),
+               linalg.norm (rmat.imag))
+        umat = umat.real
     kf3.mo_coeff = kf_ref.mo_coeff @ umat
     kf3.impweights = kf1.impweights + kf2.impweights
     
diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index b252741c..8658d25d 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -242,14 +242,13 @@ def get_kappa (las, kf1, kf2):
     '''
     log = logger.new_logger (las, las.verbose)
 
-    # Initial guess for rmat using orbital_block_svd
+    # Work in orbital block svd basis for numerical stability
     u, svals, vh = orbital_block_svd (las, kf1, kf2)
-    rmat = u @ vh
 
     # Iteration parameters
     tol_strict = 1e-6
     tol_target = 1e-10
-    max_cycle = 100
+    max_cycle = 1000
 
     # Indexing
     nao, nmo = kf1.mo_coeff.shape
@@ -260,20 +259,21 @@ def get_kappa (las, kf1, kf2):
     blkoff = np.cumsum (nblk)
 
     # Iteration
-    mo1 = kf1.mo_coeff
-    mo2 = kf2.mo_coeff
+    mo1 = kf1.mo_coeff @ u
+    mo2 = kf2.mo_coeff @ vh.conj ().T
     s0 = las._scf.get_ovlp ()
     ovlp = mo1.conj ().T @ s0 @ mo2
-    rmat1 = np.zeros_like (rmat)
+    rmat = np.eye (nmo)
     lasterr = 1
     log.debug ('get_kappa: iterating BCH expansion until maximum diagonal element is less than %e',
                tol_target)
     for it in range (max_cycle):
         kappa = linalg.logm (ovlp @ rmat.conj ().T)
-        skewerr = linalg.norm (kappa + kappa.T) 
+        rmat1 = np.zeros_like (kappa)
+        skewerr = linalg.norm (kappa + kappa.conj ().T) 
         if (skewerr/nmo)>tol_strict:
             log.error ('get_kappa matrix logarithm failed (skewerr = %e)', skewerr)
-        kappa = .5 * (kappa - kappa.T)
+        kappa = .5 * (kappa - kappa.conj ().T)
         diagerr = 0
         for i in range (len (nblk)):
             i1 = blkoff[i]
@@ -289,12 +289,23 @@ def get_kappa (las, kf1, kf2):
     if diagerr > tol_strict:
         log.warn ('get_kappa iteration failed after %d cycles with err = %e',
                   it, diagerr)
-    
+
+    # Rollback from orbital_block_svd basis into original basis
+    kappa = u @ kappa @ u.conj ().T
+    rmat = u @ rmat @ vh
+
     # Final check
-    umat = linalg.expm (kappa) @ rmat
-    finalerr = linalg.norm ((umat.conj ().T @ ovlp) - np.eye (nmo))
+    mo1 = kf1.mo_coeff @ linalg.expm (kappa) @ rmat
+    fovlp = mo1.conj ().T @ s0 @ kf2.mo_coeff
+    finalerr = linalg.norm ((fovlp) - np.eye (nmo))
     log.debug ('get_kappa final error = %e', finalerr)
-    assert (finalerr < tol_strict)
+    try:
+        assert (finalerr < tol_strict), '{}'.format (finalerr)
+    except AssertionError as err:
+        np.save ('ovlp.npy', ovlp)
+        np.save ('fovlp.npy', fovlp)
+        print (ovlp.diagonal ())
+        raise (err)
 
     return kappa, rmat
 

From a1c8d26a5ccfb933c728caa085744482997127b5 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 16 Jul 2024 14:31:58 -0500
Subject: [PATCH 22/78] remove print line

---
 my_pyscf/mcscf/lasscf_async/combine.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 7bd640ff..9b3d4980 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -217,10 +217,6 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
     if np.iscomplexobj (umat):
         log.warn ('Complex umat constructed. Discarding imaginary part; norm: %e',
                   linalg.norm (umat.imag))
-        print ("Rmat's fault or kappa's fault or both?",
-               linalg.norm (kappa.imag),
-               linalg.norm (linalg.expm (kappa).imag),
-               linalg.norm (rmat.imag))
         umat = umat.real
     kf3.mo_coeff = kf_ref.mo_coeff @ umat
     kf3.impweights = kf1.impweights + kf2.impweights

From 4d24c517b3b4b3089743b17d6f271a19960d7e7c Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 16 Jul 2024 16:04:49 -0500
Subject: [PATCH 23/78] ncore = 0 safety

---
 examples/lasscf_async/h4_631g.py        | 17 +++++++++
 my_pyscf/mcscf/lasscf_async/combine.py  | 15 +++++++-
 my_pyscf/mcscf/lasscf_async/keyframe.py | 49 +++++++++++++------------
 3 files changed, 56 insertions(+), 25 deletions(-)
 create mode 100755 examples/lasscf_async/h4_631g.py

diff --git a/examples/lasscf_async/h4_631g.py b/examples/lasscf_async/h4_631g.py
new file mode 100755
index 00000000..11148834
--- /dev/null
+++ b/examples/lasscf_async/h4_631g.py
@@ -0,0 +1,17 @@
+import numpy as np
+from scipy import linalg
+from pyscf import gto, scf, lib, mcscf
+from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
+
+xyz = '''H 0.0 0.0 0.0
+         H 1.0 0.0 0.0
+         H 0.2 3.9 0.1
+         H 1.159166 4.1 -0.1'''
+mol = gto.M (atom = xyz, basis = '6-31g', output='h4_631g.log',
+    verbose=lib.logger.DEBUG)
+mf = scf.RHF (mol).run ()
+las = LASSCF (mf, (2,2), (2,2), spin_sub=(1,1))
+frag_atom_list = ((0,1),(2,3))
+mo_loc = las.set_fragments_(frag_atom_list, mf.mo_coeff)
+las.kernel (mo_loc)
+
diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 9b3d4980..7e6fee65 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -186,8 +186,7 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
     kappa2, rmat2 = keyframe.get_kappa (las, kf2, kf_ref)
     denom = w1 + w2
     denom[denom<1e-8] = 1e-8
-    #kappa = ((w1*kappa1) + (w2*kappa2)) / denom
-    kappa = kappa1 + kappa2
+    kappa = ((w1*kappa1) + (w2*kappa2)) / denom
     rmat = np.eye (nmo) + np.zeros_like (rmat1) + np.zeros_like (rmat2) # complex safety
 
     # Figure out which fragments are associated w the two keyframes
@@ -200,6 +199,7 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
         # kf1
         w = sum (kf1.impweights[i0:i1]) / las.ncas_sub[i]
         if np.isclose (w, 1):
+            kf1_frags.append (i)
             kf3.ci[i] = kf1.ci[i]
             rmat[i0:i1,i0:i1] = rmat1[i0:i1,i0:i1]
         elif abs (w) > 1e-4:
@@ -207,6 +207,7 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
         # kf2
         w = sum (kf2.impweights[i0:i1]) / las.ncas_sub[i]
         if np.isclose (w, 1):
+            kf2_frags.append (i)
             kf3.ci[i] = kf2.ci[i]
             rmat[i0:i1,i0:i1] = rmat2[i0:i1,i0:i1]
         elif abs (w) > 1e-4:
@@ -221,6 +222,16 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
     kf3.mo_coeff = kf_ref.mo_coeff @ umat
     kf3.impweights = kf1.impweights + kf2.impweights
     
+    # Double-check active orbitals
+    s0 = las._scf.get_ovlp ()
+    for k, frags in zip ([kf1,kf2], [kf1_frags, kf2_frags]):
+        for i in frags:
+            i1 = offs[i]
+            i0 = i1 - las.ncas_sub[i]
+            ovlp = k.mo_coeff[:,i0:i1].conj ().T @ s0 @ kf3.mo_coeff[:,i0:i1]
+            u, svals, vh = linalg.svd (ovlp)
+            print (sum (ovlp.diagonal ()), sum (svals))
+
     return kf3
 
 
diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 8658d25d..03315e3b 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -121,15 +121,20 @@ def orbital_block_svd (las, kf1, kf2):
     nocc = ncore + ncas
     nvirt = nmo - nocc
 
+    u = []
+    svals = []
+    vh = []
+
     s0 = las._scf.get_ovlp ()
-    mo1 = kf1.mo_coeff[:,:ncore]
-    mo2 = kf2.mo_coeff[:,:ncore]
-    s1 = mo1.conj ().T @ s0 @ mo2
-    u_core, svals_core, vh_core = svd (s1)
-
-    u = [u_core,]
-    svals = [svals_core,]
-    vh = [vh_core,]
+    if ncore:
+        mo1 = kf1.mo_coeff[:,:ncore]
+        mo2 = kf2.mo_coeff[:,:ncore]
+        s1 = mo1.conj ().T @ s0 @ mo2
+        u_core, svals_core, vh_core = svd (s1)
+        u.append (u_core)
+        svals.append (svals_core)
+        vh.append (vh_core)
+
     for ifrag, (fcibox, c1_r, c2_r) in enumerate (zip (las.fciboxes, kf1.ci, kf2.ci)):
         nlas, nelelas = las.ncas_sub[ifrag], las.nelecas_sub[ifrag]
         i = ncore + sum (las.ncas_sub[:ifrag])
@@ -142,13 +147,14 @@ def orbital_block_svd (las, kf1, kf2):
         svals.append (svals_i)
         vh.append (vh_i)
 
-    mo1 = kf1.mo_coeff[:,nocc:]
-    mo2 = kf2.mo_coeff[:,nocc:]
-    s1 = mo1.conj ().T @ s0 @ mo2
-    u_virt, svals_virt, vh_virt = svd (s1)
-    u.append (u_virt)
-    svals.append (svals_virt)
-    vh.append (vh_virt)
+    if nvirt:
+        mo1 = kf1.mo_coeff[:,nocc:]
+        mo2 = kf2.mo_coeff[:,nocc:]
+        s1 = mo1.conj ().T @ s0 @ mo2
+        u_virt, svals_virt, vh_virt = svd (s1)
+        u.append (u_virt)
+        svals.append (svals_virt)
+        vh.append (vh_virt)
 
     u = linalg.block_diag (*u)
     svals = np.concatenate (svals)
@@ -255,7 +261,10 @@ def get_kappa (las, kf1, kf2):
     ncore, ncas = las.ncore, las.ncas
     nocc = ncore + ncas
     nvirt = nmo - nocc
-    nblk = [ncore,] + list (las.ncas_sub) + [nvirt,]
+    nblk = []
+    if ncore: nblk.append (ncore)
+    nblk += list (las.ncas_sub)
+    if nvirt: nblk.append (nvirt)
     blkoff = np.cumsum (nblk)
 
     # Iteration
@@ -299,13 +308,7 @@ def get_kappa (las, kf1, kf2):
     fovlp = mo1.conj ().T @ s0 @ kf2.mo_coeff
     finalerr = linalg.norm ((fovlp) - np.eye (nmo))
     log.debug ('get_kappa final error = %e', finalerr)
-    try:
-        assert (finalerr < tol_strict), '{}'.format (finalerr)
-    except AssertionError as err:
-        np.save ('ovlp.npy', ovlp)
-        np.save ('fovlp.npy', fovlp)
-        print (ovlp.diagonal ())
-        raise (err)
+    assert (finalerr < tol_strict), '{}'.format (finalerr)
 
     return kappa, rmat
 

From 1c9a53e6a10250d42549ebff16108033364242d4 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 17 Jul 2024 11:42:17 -0500
Subject: [PATCH 24/78] delete printing

---
 my_pyscf/mcscf/lasscf_async/combine.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 7e6fee65..0d8edb66 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -222,16 +222,6 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
     kf3.mo_coeff = kf_ref.mo_coeff @ umat
     kf3.impweights = kf1.impweights + kf2.impweights
     
-    # Double-check active orbitals
-    s0 = las._scf.get_ovlp ()
-    for k, frags in zip ([kf1,kf2], [kf1_frags, kf2_frags]):
-        for i in frags:
-            i1 = offs[i]
-            i0 = i1 - las.ncas_sub[i]
-            ovlp = k.mo_coeff[:,i0:i1].conj ().T @ s0 @ kf3.mo_coeff[:,i0:i1]
-            u, svals, vh = linalg.svd (ovlp)
-            print (sum (ovlp.diagonal ()), sum (svals))
-
     return kf3
 
 

From c33cdbda00c385cc42ba6406a2d59062db848b93 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 17 Jul 2024 12:17:11 -0500
Subject: [PATCH 25/78] keyframe.democratic_matrix

This kappa summing business just doesn't work
---
 my_pyscf/mcscf/lasscf_async/combine.py  | 44 +++++++++----------------
 my_pyscf/mcscf/lasscf_async/crunch.py   |  7 ++--
 my_pyscf/mcscf/lasscf_async/keyframe.py | 35 +++++++++++++++++++-
 3 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 0d8edb66..d0e0dd22 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -165,7 +165,7 @@ def impweights (las, mo_coeff, impurities):
 
 def combine_o1_rigid (las, kf1, kf2, kf_ref):
     '''Combine two keyframes (without relaxing the active orbitals) by weighting the kappa matrices
-    with respect to a third reference keyframe by the impweights parameter
+    with respect to a third reference keyframe democratically
 
     Args:
         las : object of :class:`LASCINoSymm`
@@ -180,47 +180,33 @@ def combine_o1_rigid (las, kf1, kf2, kf_ref):
     log = lib.logger.new_logger (las, las.verbose)
     nmo = las.mo_coeff.shape[1]
     kf3 = kf_ref.copy ()
-    w1 = np.add.outer (kf1.impweights, kf2.impweights)
-    w2 = np.add.outer (kf1.impweights, kf2.impweights)
     kappa1, rmat1 = keyframe.get_kappa (las, kf1, kf_ref)
     kappa2, rmat2 = keyframe.get_kappa (las, kf2, kf_ref)
-    denom = w1 + w2
-    denom[denom<1e-8] = 1e-8
-    kappa = ((w1*kappa1) + (w2*kappa2)) / denom
+    kappa1 = keyframe.democratic_matrix (las, kappa1, kf1.frags, kf_ref.mo_coeff)
+    kappa2 = keyframe.democratic_matrix (las, kappa2, kf2.frags, kf_ref.mo_coeff)
+    kappa = kappa1 + kappa2
     rmat = np.eye (nmo) + np.zeros_like (rmat1) + np.zeros_like (rmat2) # complex safety
 
-    # Figure out which fragments are associated w the two keyframes
     offs = np.cumsum (las.ncas_sub) + las.ncore
-    kf1_frags = []
-    kf2_frags = []
-    for i in range (las.nfrags):
+    for i in kf1.frags:
         i1 = offs[i]
         i0 = i1 - las.ncas_sub[i]
-        # kf1
-        w = sum (kf1.impweights[i0:i1]) / las.ncas_sub[i]
-        if np.isclose (w, 1):
-            kf1_frags.append (i)
-            kf3.ci[i] = kf1.ci[i]
-            rmat[i0:i1,i0:i1] = rmat1[i0:i1,i0:i1]
-        elif abs (w) > 1e-4:
-            raise RuntimeError ("fragment split between impurities? ({})".format (w))
-        # kf2
-        w = sum (kf2.impweights[i0:i1]) / las.ncas_sub[i]
-        if np.isclose (w, 1):
-            kf2_frags.append (i)
-            kf3.ci[i] = kf2.ci[i]
-            rmat[i0:i1,i0:i1] = rmat2[i0:i1,i0:i1]
-        elif abs (w) > 1e-4:
-            raise RuntimeError ("fragment split between impurities? ({})".format (w))
-
-    # set orbitals and impweights
+        kf3.ci[i] = kf1.ci[i]
+        rmat[i0:i1,i0:i1] = rmat1[i0:i1,i0:i1]
+    for i in kf2.frags:
+        i1 = offs[i]
+        i0 = i1 - las.ncas_sub[i]
+        kf3.ci[i] = kf2.ci[i]
+        rmat[i0:i1,i0:i1] = rmat2[i0:i1,i0:i1]
+
+    # set orbitals and frag associations
     umat = linalg.expm (kappa) @ rmat
     if np.iscomplexobj (umat):
         log.warn ('Complex umat constructed. Discarding imaginary part; norm: %e',
                   linalg.norm (umat.imag))
         umat = umat.real
     kf3.mo_coeff = kf_ref.mo_coeff @ umat
-    kf3.impweights = kf1.impweights + kf2.impweights
+    kf3.frags = kf1.frags.union (kf2.frags)
     
     return kf3
 
diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 75c87537..30bfd0e3 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -354,15 +354,11 @@ def _push_keyframe (self, kf1, mo_coeff=None, ci=None):
         if ci is None: ci=self.ci
         log = logger.new_logger (self, self.verbose)
         kf2 = kf1.copy ()
+        kf2.frags = set ([self._ifrag,])
         imporb_coeff = self.mol.get_imporb_coeff ()
         mo_self = imporb_coeff @ mo_coeff
         las = self.mol._las
 
-        # impweights for combining updates
-        s0 = las._scf.get_ovlp ()
-        ovlp = kf1.mo_coeff.conj ().T @ s0 @ imporb_coeff
-        kf2.impweights = (ovlp @ ovlp.conj ().T).diagonal ()
-
         # active orbital part should be easy
         kf2.ci[self._ifrag] = self.ci
         i = las.ncore + sum (las.ncas_sub[:self._ifrag])
@@ -372,6 +368,7 @@ def _push_keyframe (self, kf1, mo_coeff=None, ci=None):
         kf2.mo_coeff[:,i:j] = mo_self[:,k:l]
 
         # Unentangled inactive orbitals
+        s0 = las._scf.get_ovlp ()
         ncore_unent = las.ncore - self.ncore
         assert (ncore_unent>=0), '{} {}'.format (las.ncore, self.ncore)
         if las.ncore:
diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 03315e3b..2687984b 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -11,7 +11,7 @@ def __init__(self, las, mo_coeff, ci):
         self.mo_coeff = mo_coeff
         self.ci = ci
         self._dm1s = self._veff = self._fock1 = self._h1eff_sub = self._h2eff_sub = None
-        self.impweights = None
+        self.frags = set ()
 
     @property
     def dm1s (self):
@@ -312,7 +312,40 @@ def get_kappa (las, kf1, kf2):
 
     return kappa, rmat
 
+def democratic_matrix (las, mat, frags, mo_coeff):
+    '''Weight a matrix in the "democratic DMET" way
 
+    Args:
+        las : object of :class:`LASCINoSymm`
+        mat : ndarray of shape (nmo, nmo)
+            In basis of mo_coeff
+        frags : sequence of integers
+            Identify fragments
+        mo_coeff : ndarray of shape (nao, nmo)
+            MO basis of mat
+
+    Returns:
+        mat : ndarray of shape (nmo, nmo)
+            Diagonal environment block eliminated; off-diagonal frag-env block halved
+    '''
+    assert (len (frags))
+    frag_orbs = []
+    for ifrag in frags:
+        frag_orbs.extend (las.frags_orbs[ifrag])
+    frag_orbs = list (set (frag_orbs))
+
+    s0 = las._scf.get_ovlp ()[frag_orbs,:][:,frag_orbs]
+    mo = mo_coeff[frag_orbs,:]
+    s1 = mo.conj ().T @ s0 @ mo
+    w, u = linalg.eigh (-s1)
+
+    mat = u.conj ().T @ mat @ u
+    n = len (frag_orbs)
+    mat[n:,:n] *= .5
+    mat[:n,n:] *= .5
+    mat[n:,n:] = 0
+
+    return u @ mat @ u.conj ().T
 
 
 

From 224b3a75f56e23bfe5747e218ec2f7440fabbfd0 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 17 Jul 2024 12:48:40 -0500
Subject: [PATCH 26/78] change strategies: use orth_orb and relax

but modify them to work on subsets of the whole problem so they
can be desynchronized
---
 my_pyscf/mcscf/lasscf_async/combine.py      | 37 +++++++++++++++------
 my_pyscf/mcscf/lasscf_async/lasscf_async.py |  2 +-
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index d0e0dd22..3279e93a 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -18,12 +18,13 @@ def orth_orb (las, kf2_list):
     # orthonormalize active orbitals
     mo_cas = np.empty ((nao, ncas), dtype=las.mo_coeff.dtype)
     ci = []
-    for ifrag, kf2 in enumerate (kf2_list):
-        i = sum (las.ncas_sub[:ifrag])
-        j = i + las.ncas_sub[ifrag]
-        k, l = i + ncore, j + ncore
-        mo_cas[:,i:j] = kf2.mo_coeff[:,k:l]
-        ci.append (kf2.ci[ifrag])
+    for kf2 in kf2_list:
+        for ifrag in kf2.frags:
+            i = sum (las.ncas_sub[:ifrag])
+            j = i + las.ncas_sub[ifrag]
+            k, l = i + ncore, j + ncore
+            mo_cas[:,i:j] = kf2.mo_coeff[:,k:l]
+            ci.append (kf2.ci[ifrag])
     mo_cas_preorth = mo_cas.copy ()
     s0 = las._scf.get_ovlp ()
     mo_cas = orth.vec_lowdin (mo_cas_preorth, s=s0)
@@ -64,8 +65,8 @@ def orth_orb (las, kf2_list):
         log.warn ('Non-orthogonal AOs in lasscf_async.combine.orth_orb: %e', errmax)
     mo1 = mo1[:,ncas:]
     if mo1.size:
-        veff = sum ([kf2.veff for kf2 in kf2_list]) / nfrags
-        dm1s = sum ([kf2.dm1s for dm1s in kf2_list]) / nfrags
+        veff = sum ([kf2.veff for kf2 in kf2_list]) / len (kf2_list)
+        dm1s = sum ([kf2.dm1s for dm1s in kf2_list]) / len (kf2_list)
         fock = las.get_hcore ()[None,:,:] + veff
         fock = get_roothaan_fock (fock, dm1s, s0)
         orbsym = None # TODO: symmetry
@@ -105,7 +106,8 @@ def __exit__(self, type, value, traceback):
         if getattr (self.las, 'with_df', None):
             self.las.with_df.stdout = self.las_stdout
 
-def relax (las, kf):
+def relax (las, kf, freeze_inactive=False, frozen_frags=None):
+    if frozen_frags is None: frozen_frags = []
     log = lib.logger.new_logger (las, las.verbose)
     flas_stdout = getattr (las, '_flas_stdout', None)
     if flas_stdout is None:
@@ -124,6 +126,17 @@ def relax (las, kf):
     with flas_stdout_env (las, flas_stdout):
         flas = lasci.LASCI (las._scf, las.ncas_sub, las.nelecas_sub)
         flas.__dict__.update (las.__dict__)
+        flas.frozen = []
+        if freeze_inactive:
+            flas.frozen.extend (list (range (las.ncore)))
+        for ifrag in frozen_frags:
+            i0 = las.ncore + sum (las.ncas_sub[:ifrag])
+            i1 = i0 + las.ncas_sub[ifrag]
+            flas.frozen.extend (list (range (i0,i1)))
+        if freeze_inactive:
+            nocc = las.ncore + las.ncas
+            nmo = kf.mo_coeff.shape[1]
+            flas.frozen.extend (list (range (nocc,nmo)))
         e_tot, e_cas, ci, mo_coeff, mo_energy, h2eff_sub, veff = \
             flas.kernel (kf.mo_coeff, ci0=kf.ci)
     ovlp = mo_coeff.conj ().T @ las._scf.get_ovlp () @ mo_coeff
@@ -138,10 +151,12 @@ def combine_o0 (las, kf2_list):
     kf1 = relax (las, kf1)
     return kf1
 
-def combine_o1 (las, kf2_list, kf_ref):
+def combine_o1 (las, kf2_list):
     kf1 = kf2_list[0]
     for kf2 in kf2_list[1:]:
-        kf1 = combine_o1_rigid (las, kf1, kf2, kf_ref)
+        kf1_frags = kf1.frags
+        kf1 = orth_orb (las, [kf1,kf2])
+        kf1.frags = kf1_frags.union (kf2.frags)
     kf1 = relax (las, kf1)
     return kf1
 
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index a126e4bb..e6d284f4 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -69,7 +69,7 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
             keyframe.get_kappa (las, kfi, kfj)
 
         # 3. Combine from fragments. TODO: smaller chunks instead of one whole-molecule function
-        kf1 = combine.combine_o1 (las, kf2_list, kf1)
+        kf1 = combine.combine_o1 (las, kf2_list)
 
         # Evaluate status and break if converged
         e_tot = las.energy_nuc () + las.energy_elec (

From ca52acafd6c334a6a94f36f6239b27796523dbbd Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 17 Jul 2024 13:38:38 -0500
Subject: [PATCH 27/78] combine_pair and orth_orb desync

---
 my_pyscf/mcscf/lasscf_async/combine.py | 59 +++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 7 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 3279e93a..5ea82f49 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -8,7 +8,7 @@
 from mrh.my_pyscf.mcscf.lasscf_async import keyframe
 
 # TODO: symmetry
-def orth_orb (las, kf2_list):
+def orth_orb (las, kf2_list, kf_ref=None):
     ncore, ncas = las.ncore, las.ncas
     nocc = ncore + ncas
     nao, nmo = las.mo_coeff.shape
@@ -17,14 +17,17 @@ def orth_orb (las, kf2_list):
 
     # orthonormalize active orbitals
     mo_cas = np.empty ((nao, ncas), dtype=las.mo_coeff.dtype)
-    ci = []
+    if kf_ref is not None:
+        ci = [c for c in kf_ref.ci]
+    else:
+        ci = [None for i in range (las.nfrags)]
     for kf2 in kf2_list:
         for ifrag in kf2.frags:
             i = sum (las.ncas_sub[:ifrag])
             j = i + las.ncas_sub[ifrag]
             k, l = i + ncore, j + ncore
             mo_cas[:,i:j] = kf2.mo_coeff[:,k:l]
-            ci.append (kf2.ci[ifrag])
+            ci[ifrag] = kf2.ci[ifrag]
     mo_cas_preorth = mo_cas.copy ()
     s0 = las._scf.get_ovlp ()
     mo_cas = orth.vec_lowdin (mo_cas_preorth, s=s0)
@@ -154,12 +157,54 @@ def combine_o0 (las, kf2_list):
 def combine_o1 (las, kf2_list):
     kf1 = kf2_list[0]
     for kf2 in kf2_list[1:]:
-        kf1_frags = kf1.frags
-        kf1 = orth_orb (las, [kf1,kf2])
-        kf1.frags = kf1_frags.union (kf2.frags)
-    kf1 = relax (las, kf1)
+        kf1 = combine_pair (las, kf1, kf2)
     return kf1
 
+def select_aa_block (las, frags1, frags2, fock1):
+    '''Identify from two lists of candidate fragments the single active-active orbital-rotation
+    gradient block with the largest norm
+
+    Args:
+        las : object of :class:`LASCINoSymm`
+        frags1 : sequence of integers
+        frags2 : sequence of integers
+        fock1 : ndarray of shape (nmo,nmo)
+
+    Returns:
+        i : integer
+            From frags1.
+        j : integer
+            From frags2.
+'''
+    frags1 = list (frags1)
+    frags2 = list (frags2)
+    g_orb = fock1 - fock1.conj ().T
+    ncore = las.ncore
+    nocc = ncore + las.ncas
+    g_orb = g_orb[ncore:nocc,ncore:nocc]
+    gblk = []
+    for ix, i in enumerate (frags1):
+        i1 = sum (las.ncas_sub[:i])
+        i0 = i1 - las.ncas_sub[i]
+        for jx, j in enumerate (frags2):
+            j1 = sum (las.ncas_sub[:j])
+            j0 = j1 - las.ncas_sub[j]
+            gblk.append (linalg.norm (g_orb[i0:i1,j0:j1]))
+    gmax = np.argmax (gblk)
+    i = frags1[gmax // len (frags2)]
+    j = frags2[gmax % len (frags2)]
+    return i, j
+
+def combine_pair (las, kf1, kf2):
+    '''Combine two keyframes and relax one specific block of active-active orbital rotations
+    between the fragments assigned to each with the inactive and virtual orbitals frozen.'''
+    kf3 = orth_orb (las, [kf1, kf2], kf_ref=kf1)
+    i, j = select_aa_block (las, kf1.frags, kf2.frags, kf3.fock1)
+    frozen = [k for k in range (las.nfrags) if k not in (i,j)]
+    kf3 = relax (las, kf3, freeze_inactive=True, frozen_frags=frozen)
+    kf3.frags = kf1.frags.union (kf2.frags)
+    return kf3
+
 def impweights (las, mo_coeff, impurities):
     '''Compute the weights of each MO in mo_coeff on the various impurities.
 

From 012639adc869128013fcc6346332eba709a8dfde Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 17 Jul 2024 14:03:51 -0500
Subject: [PATCH 28/78] comment todos

---
 my_pyscf/mcscf/lasscf_async/combine.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 5ea82f49..bd3a0c44 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -110,6 +110,8 @@ def __exit__(self, type, value, traceback):
             self.las.with_df.stdout = self.las_stdout
 
 def relax (las, kf, freeze_inactive=False, frozen_frags=None):
+    # TODO: frozen CI-vector elements in flas subproblem solver
+    # TODO: bottom-up 2-frag subproblem reimplementation
     if frozen_frags is None: frozen_frags = []
     log = lib.logger.new_logger (las, las.verbose)
     flas_stdout = getattr (las, '_flas_stdout', None)

From aaf5d86daf9e0330f5773184c27c6a5d1ac7d8c4 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 17 Jul 2024 15:32:36 -0500
Subject: [PATCH 29/78] lasci_sync frozen_ci implementation

---
 my_pyscf/mcscf/lasci.py      |  3 ++-
 my_pyscf/mcscf/lasci_sync.py | 39 +++++++++++++++++++++++++++---------
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/my_pyscf/mcscf/lasci.py b/my_pyscf/mcscf/lasci.py
index 2d11b823..a92e713b 100644
--- a/my_pyscf/mcscf/lasci.py
+++ b/my_pyscf/mcscf/lasci.py
@@ -880,7 +880,7 @@ def get_nelec_frs (las):
 
 class LASCINoSymm (casci.CASCI):
 
-    def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, frozen=None, **kwargs):
+    def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, frozen=None, frozen_ci=None, **kwargs):
         if isinstance(ncas,int):
             ncas = [ncas]
         ncas_tot = sum (ncas)
@@ -904,6 +904,7 @@ def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, frozen=None, **
         self.nelecas_sub = np.asarray (nelecas)
         assert (len (self.nelecas_sub) == self.nfrags)
         self.frozen = frozen
+        self.frozen_ci = frozen_ci
         self.conv_tol_grad = 1e-4
         self.conv_tol_self = 1e-10
         self.ah_level_shift = 1e-8
diff --git a/my_pyscf/mcscf/lasci_sync.py b/my_pyscf/mcscf/lasci_sync.py
index 4ab65fc5..2f7c9d61 100644
--- a/my_pyscf/mcscf/lasci_sync.py
+++ b/my_pyscf/mcscf/lasci_sync.py
@@ -252,6 +252,8 @@ def my_callback (x):
 
 def ci_cycle (las, mo, ci0, veff, h2eff_sub, casdm1frs, log):
     if ci0 is None: ci0 = [None for idx in range (las.nfrags)]
+    frozen_ci = las.frozen_ci
+    if frozen_ci is None: frozen_ci = []
     # CI problems
     t1 = (lib.logger.process_clock(), lib.logger.perf_counter())
     h1eff_sub = las.get_h1eff (mo, veff=veff, h2eff_sub=h2eff_sub, casdm1frs=casdm1frs)
@@ -288,10 +290,13 @@ def ci_cycle (las, mo, ci0, veff, h2eff_sub, casdm1frs, log):
                 log.debug1 ("LASCI subspace {} state {} with wfnsym {}".format (isub, state,
                                                                                 wfnsym_str))
 
-        e_sub, fcivec = fcibox.kernel(h1e, eri_cas, ncas, nelecas,
-                                      ci0=fcivec, verbose=log,
-                                      #max_memory = max_memory issue #54
-                                      ecore=e0, orbsym=orbsym)
+        if isub not in frozen_ci:
+            e_sub, fcivec = fcibox.kernel(h1e, eri_cas, ncas, nelecas,
+                                          ci0=fcivec, verbose=log,
+                                          #max_memory = max_memory issue #54
+                                          ecore=e0, orbsym=orbsym)
+        else:
+            e_sub = 0 # TODO: proper energy calculation (probably doesn't matter tho)
         e_cas.append (e_sub)
         ci1.append (fcivec)
         t1 = log.timer ('FCI box for subspace {}'.format (isub), *t1)
@@ -342,6 +347,8 @@ class LASCI_UnitaryGroupGenerators (object):
             Number of molecular orbitals
         frozen : sequence of int or index mask array
             Identify orbitals which are frozen.
+        frozen_ci : sequence of int
+            Identify fragments whose CI vectors are frozen
         nfrz_orb_idx : index mask array
             Identifies all nonredundant orbital rotation amplitudes for non-frozen orbitals
         uniq_orb_idx : index mask array
@@ -363,6 +370,7 @@ class LASCI_UnitaryGroupGenerators (object):
     def __init__(self, las, mo_coeff, ci):
         self.nmo = mo_coeff.shape[-1]
         self.frozen = las.frozen
+        self.frozen_ci = las.frozen_ci
         self._init_orb (las, mo_coeff, ci)
         self._init_ci (las, mo_coeff, ci)
 
@@ -391,6 +399,7 @@ def get_gx_idx (self):
 
     def _init_ci (self, las, mo_coeff, ci):
         self.ci_transformers = []
+        if self.frozen_ci is None: self.frozen_ci = []
         for i, fcibox in enumerate (las.fciboxes):
             norb, nelec = las.ncas_sub[i], las.nelecas_sub[i]
             tf_list = []
@@ -407,7 +416,8 @@ def _init_ci (self, las, mo_coeff, ci):
 
     def pack (self, kappa, ci_sub):
         x = kappa[self.uniq_orb_idx]
-        for trans_frag, ci_frag in zip (self.ci_transformers, ci_sub):
+        for ix, (trans_frag, ci_frag) in enumerate (zip (self.ci_transformers, ci_sub)):
+            if ix in self.frozen_ci: continue
             for transformer, ci in zip (trans_frag, ci_frag):
                 x = np.append (x, transformer.vec_det2csf (ci, normalize=False))
         assert (x.shape[0] == self.nvar_tot)
@@ -420,12 +430,17 @@ def unpack (self, x):
 
         y = x[self.nvar_orb:]
         ci_sub = []
-        for trans_frag in self.ci_transformers:
+        for ix, trans_frag in enumerate (self.ci_transformers):
             ci_frag = []
             for transformer in trans_frag:
-                ncsf = transformer.ncsf
-                ci_frag.append (transformer.vec_csf2det (y[:ncsf], normalize=False))
-                y = y[ncsf:]
+                if ix in self.frozen_ci:
+                    ndeta = transformer.ndeta
+                    ndetb = transformer.ndetb
+                    ci_frag.append (np.zeros ((ndeta,ndetb)))
+                else:
+                    ncsf = transformer.ncsf
+                    ci_frag.append (transformer.vec_csf2det (y[:ncsf], normalize=False))
+                    y = y[ncsf:]
             ci_sub.append (ci_frag)
 
         return kappa, ci_sub
@@ -438,6 +453,7 @@ def addr2idstr (self, addr):
             addr -= self.nvar_orb
             ncsf_frag = self.ncsf_sub.sum (1)
             for i, trans_frag in enumerate (self.ci_transformers):
+                if i in self.frozen_ci: continue
                 if addr >= ncsf_frag[i]:
                     addr -= ncsf_frag[i]
                     continue
@@ -458,7 +474,8 @@ def nvar_orb (self):
     @property
     def ncsf_sub (self):
         return np.asarray ([[transformer.ncsf for transformer in trans_frag]
-                            for trans_frag in self.ci_transformers])
+                            for i,trans_frag in enumerate (self.ci_transformers)
+                            if i not in self.frozen_ci])
 
     @property
     def nvar_tot (self):
@@ -475,6 +492,7 @@ class LASCISymm_UnitaryGroupGenerators (LASCI_UnitaryGroupGenerators):
     def __init__(self, las, mo_coeff, ci): 
         self.nmo = mo_coeff.shape[-1]
         self.frozen = las.frozen
+        self.frozen_ci = las.frozen_ci
         if getattr (mo_coeff, 'orbsym', None) is None:
             mo_coeff = las.label_symmetry_(mo_coeff)
         orbsym = mo_coeff.orbsym
@@ -488,6 +506,7 @@ def _init_orb (self, las, mo_coeff, ci, orbsym):
         self.nfrz_orb_idx[self.symm_forbid] = False
 
     def _init_ci (self, las, mo_coeff, ci, orbsym):
+        if self.frozen_ci is None: self.frozen_ci = []
         sub_slice = np.cumsum ([0] + las.ncas_sub.tolist ()) + las.ncore
         orbsym_sub = [orbsym[i:sub_slice[isub+1]] for isub, i in enumerate (sub_slice[:-1])]
         self.ci_transformers = []

From dfd7973a34ac493de59091291fb3f7e18fcbad41 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 17 Jul 2024 15:33:54 -0500
Subject: [PATCH 30/78] frozen_ci use in combine_pair

---
 my_pyscf/mcscf/lasscf_async/combine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index bd3a0c44..4990000d 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -110,7 +110,6 @@ def __exit__(self, type, value, traceback):
             self.las.with_df.stdout = self.las_stdout
 
 def relax (las, kf, freeze_inactive=False, frozen_frags=None):
-    # TODO: frozen CI-vector elements in flas subproblem solver
     # TODO: bottom-up 2-frag subproblem reimplementation
     if frozen_frags is None: frozen_frags = []
     log = lib.logger.new_logger (las, las.verbose)
@@ -132,6 +131,7 @@ def relax (las, kf, freeze_inactive=False, frozen_frags=None):
         flas = lasci.LASCI (las._scf, las.ncas_sub, las.nelecas_sub)
         flas.__dict__.update (las.__dict__)
         flas.frozen = []
+        flas.frozen_ci = frozen_frags
         if freeze_inactive:
             flas.frozen.extend (list (range (las.ncore)))
         for ifrag in frozen_frags:

From c8cb6a61703135e26473e54ae663a364b21a1f41 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 17 Jul 2024 16:40:46 -0500
Subject: [PATCH 31/78] lasscf_async combine_o1 not crashing or converging

The failure to converge even the simplest 3-fragment molecules
appears to be due primarily to the coupling between i<->j and i<->k
orbital rotations
---
 my_pyscf/mcscf/lasci_sync.py           |  6 ++++--
 my_pyscf/mcscf/lasscf_async/combine.py | 18 ++++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/my_pyscf/mcscf/lasci_sync.py b/my_pyscf/mcscf/lasci_sync.py
index 2f7c9d61..4718af8a 100644
--- a/my_pyscf/mcscf/lasci_sync.py
+++ b/my_pyscf/mcscf/lasci_sync.py
@@ -102,7 +102,8 @@ def kernel (las, mo_coeff=None, ci0=None, casdm0_fr=None, conv_tol_grad=1e-4,
                 err = linalg.norm (g_orb_test - g_vec[:ugg.nvar_orb])
                 log.debug ('GRADIENT IMPLEMENTATION TEST: |D g_orb| = %.15g', err)
                 assert (err < 1e-5), '{}'.format (err)
-            for isub in range (len (ci1)): # TODO: double-check that this code works in SA-LASSCF
+            for isub in range (len (ugg.ncsf_sub)):
+                # TODO: double-check that this code works in SA-LASSCF
                 i = ugg.ncsf_sub[:isub].sum ()
                 j = i + ugg.ncsf_sub[isub].sum ()
                 k = i + ugg.nvar_orb
@@ -436,7 +437,7 @@ def unpack (self, x):
                 if ix in self.frozen_ci:
                     ndeta = transformer.ndeta
                     ndetb = transformer.ndetb
-                    ci_frag.append (np.zeros ((ndeta,ndetb)))
+                    ci_frag.append (np.zeros ((ndeta*ndetb)))
                 else:
                     ncsf = transformer.ncsf
                     ci_frag.append (transformer.vec_csf2det (y[:ncsf], normalize=False))
@@ -1288,6 +1289,7 @@ def _get_Hci_diag (self):
         Hci_diag = []
         for ix, (fcibox, norb, nelec, h1rs, csf_list) in enumerate (zip (self.fciboxes, 
          self.ncas_sub, self.nelecas_sub, self.h1frs, self.ugg.ci_transformers)):
+            if ix in self.ugg.frozen_ci: continue
             i = sum (self.ncas_sub[:ix])
             j = i + norb
             h2 = self.eri_cas[i:j,i:j,i:j,i:j]
diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 4990000d..f0134910 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -12,15 +12,16 @@ def orth_orb (las, kf2_list, kf_ref=None):
     ncore, ncas = las.ncore, las.ncas
     nocc = ncore + ncas
     nao, nmo = las.mo_coeff.shape
-    nfrags = len (kf2_list)
+    nfrags = las.nfrags
     log = lib.logger.new_logger (las, las.verbose)
 
     # orthonormalize active orbitals
-    mo_cas = np.empty ((nao, ncas), dtype=las.mo_coeff.dtype)
     if kf_ref is not None:
         ci = [c for c in kf_ref.ci]
+        mo_cas = kf_ref.mo_coeff[:,ncore:nocc].copy ()
     else:
         ci = [None for i in range (las.nfrags)]
+        mo_cas = np.empty ((nao, ncas), dtype=las.mo_coeff.dtype)
     for kf2 in kf2_list:
         for ifrag in kf2.frags:
             i = sum (las.ncas_sub[:ifrag])
@@ -185,16 +186,17 @@ def select_aa_block (las, frags1, frags2, fock1):
     nocc = ncore + las.ncas
     g_orb = g_orb[ncore:nocc,ncore:nocc]
     gblk = []
-    for ix, i in enumerate (frags1):
-        i1 = sum (las.ncas_sub[:i])
-        i0 = i1 - las.ncas_sub[i]
-        for jx, j in enumerate (frags2):
-            j1 = sum (las.ncas_sub[:j])
-            j0 = j1 - las.ncas_sub[j]
+    for i in frags1:
+        i0 = sum (las.ncas_sub[:i])
+        i1 = i0 + las.ncas_sub[i]
+        for j in frags2:
+            j0 = sum (las.ncas_sub[:j])
+            j1 = j0 + las.ncas_sub[j]
             gblk.append (linalg.norm (g_orb[i0:i1,j0:j1]))
     gmax = np.argmax (gblk)
     i = frags1[gmax // len (frags2)]
     j = frags2[gmax % len (frags2)]
+    print (i, j, gblk[gmax])
     return i, j
 
 def combine_pair (las, kf1, kf2):

From be2571120e041ba8ba168906bdbd5823733bc7bb Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 17 Jul 2024 17:00:15 -0500
Subject: [PATCH 32/78] first lasscf_async combine_o1 3-frag convergence

The problem in the previous commit was actually due to the
evaluation of the convergence tolerance. In the subproblem, the
gradient was lower than the tolerance, but the overall gradient
norm was larger than it, so the iteration just stopped making
progress. Enforcing a minimum of 1 cycle through the subproblem
iteration brute-force solves this issue for now.
---
 my_pyscf/mcscf/lasci.py                | 1 +
 my_pyscf/mcscf/lasci_sync.py           | 8 +++++---
 my_pyscf/mcscf/lasscf_async/combine.py | 3 ++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/my_pyscf/mcscf/lasci.py b/my_pyscf/mcscf/lasci.py
index a92e713b..7bae5cb1 100644
--- a/my_pyscf/mcscf/lasci.py
+++ b/my_pyscf/mcscf/lasci.py
@@ -910,6 +910,7 @@ def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, frozen=None, fr
         self.ah_level_shift = 1e-8
         self.max_cycle_macro = 50
         self.max_cycle_micro = 5
+        self.min_cycle_macro = 0
         keys = set(('e_states', 'fciboxes', 'nroots', 'weights', 'ncas_sub', 'nelecas_sub',
                     'conv_tol_grad', 'conv_tol_self', 'max_cycle_macro', 'max_cycle_micro',
                     'ah_level_shift', 'states_converged', 'chkfile', 'e_lexc'))
diff --git a/my_pyscf/mcscf/lasci_sync.py b/my_pyscf/mcscf/lasci_sync.py
index 4718af8a..a8d9a4b5 100644
--- a/my_pyscf/mcscf/lasci_sync.py
+++ b/my_pyscf/mcscf/lasci_sync.py
@@ -128,9 +128,11 @@ def kernel (las, mo_coeff=None, ci0=None, casdm0_fr=None, conv_tol_grad=1e-4,
         #    ('LASCI micro init : E = %.15g ; |g_orb| = %.15g ; |g_ci| = %.15g ; |x0_orb| = %.15g '
         #    '; |x0_ci| = %.15g'), H_op.e_tot, norm_gorb, norm_gci, norm_xorb, norm_xci)
         las.dump_chk (mo_coeff=mo_coeff, ci=ci1)
-        if (norm_gorb<conv_tol_grad and norm_gci<conv_tol_grad)or((norm_gorb+norm_gci)<norm_gx/10):
-            converged = True
-            break
+        if (((norm_gorb<conv_tol_grad and norm_gci<conv_tol_grad)
+             or ((norm_gorb+norm_gci)<norm_gx/10))
+            and (it>=las.min_cycle_macro)):
+                converged = True
+                break
         H_op._init_eri_() 
         # ^ This is down here to save time in case I am already converged at initialization
         t1 = log.timer ('LASCI Hessian constructor', *t1)
diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index f0134910..db5e005d 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -133,6 +133,8 @@ def relax (las, kf, freeze_inactive=False, frozen_frags=None):
         flas.__dict__.update (las.__dict__)
         flas.frozen = []
         flas.frozen_ci = frozen_frags
+        # TODO: ensure robust tolerance selection so things always make progress
+        flas.min_cycle_macro = 1
         if freeze_inactive:
             flas.frozen.extend (list (range (las.ncore)))
         for ifrag in frozen_frags:
@@ -196,7 +198,6 @@ def select_aa_block (las, frags1, frags2, fock1):
     gmax = np.argmax (gblk)
     i = frags1[gmax // len (frags2)]
     j = frags2[gmax % len (frags2)]
-    print (i, j, gblk[gmax])
     return i, j
 
 def combine_pair (las, kf1, kf2):

From b43d366d8884cdbc3b362b2fec1ea0a71cba25a8 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 18 Jul 2024 12:42:27 -0500
Subject: [PATCH 33/78] lasscf_async subproblem parameters (#104)

Address subproblem parameters (max_cycle_macro etc.) with new
"impurity_params" and "relax_params" attributes. Usage is demonstrated
in examples/lasscf_async/c2h6n4_lasscf88_sto3g.py.
---
 .../lasscf_async/c2h6n4_lasscf88_sto3g.py     |  7 +++++
 my_pyscf/mcscf/lasci.py                       |  6 +++-
 my_pyscf/mcscf/lasscf_async/combine.py        |  3 ++
 my_pyscf/mcscf/lasscf_async/crunch.py         |  4 +++
 my_pyscf/mcscf/lasscf_async/lasscf_async.py   | 30 +++++++++++++++++++
 5 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/examples/lasscf_async/c2h6n4_lasscf88_sto3g.py b/examples/lasscf_async/c2h6n4_lasscf88_sto3g.py
index 6c248181..da3fc09c 100644
--- a/examples/lasscf_async/c2h6n4_lasscf88_sto3g.py
+++ b/examples/lasscf_async/c2h6n4_lasscf88_sto3g.py
@@ -15,7 +15,14 @@
                        smults=[[1,1],[3,1],[3,1],[1,3],[1,3]])
 las_syn.kernel (mo)
 print ("Synchronous calculation converged?", las_syn.converged)
+
 las_asyn = asyn.LASSCF (mf, (4,4), ((4,0),(0,4)), spin_sub=(5,5))
+# To fiddle with the optimization parameters of the various subproblems, use
+# the "impurity_params" and "relax_params" dictionaries
+las_asyn.max_cycle_macro = 50 # by default, all subproblems use this
+las_asyn.impurity_params['max_cycle_macro'] = 51 # all fragments
+las_asyn.impurity_params[1]['max_cycle_macro'] = 52 # second fragment only (has priority)
+las_asyn.relax_params['max_cycle_macro'] = 53
 mo = las_asyn.set_fragments_((list (range (3)), list (range (9,12))), mf.mo_coeff)
 las_asyn.state_average_(weights=[1,0,0,0,0],
                         spins=[[0,0],[2,0],[-2,0],[0,2],[0,-2]],
diff --git a/my_pyscf/mcscf/lasci.py b/my_pyscf/mcscf/lasci.py
index 2d11b823..206ff077 100644
--- a/my_pyscf/mcscf/lasci.py
+++ b/my_pyscf/mcscf/lasci.py
@@ -2048,7 +2048,11 @@ def dump_flags (self, verbose=None, _method_name='LASCI'):
         for i, (no, ne) in enumerate (zip (self.ncas_sub, self.nelecas_sub)):
             log.info ('LAS %d : (%de+%de, %do)', i, ne[0], ne[1], no)
         log.info ('nroots = %d', self.nroots)
-        log.info ('max_memory %d (MB)', self.max_memory)
+        log.info ('max_cycle_macro = %d', self.max_cycle_macro)
+        log.info ('max_cycle_micro = %d', self.max_cycle_micro)
+        log.info ('conv_tol_grad = %s', self.conv_tol_grad)
+        log.info ('max_memory %d MB (current use %d MB)', self.max_memory,
+                  lib.current_memory()[0])
         for i, fcibox in enumerate (self.fciboxes):
             if getattr (fcibox, 'dump_flags', None):
                 log.info ('fragment %d FCI solver flags:', i)
diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 770810bf..19421374 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -123,6 +123,9 @@ def relax (las, kf):
     with flas_stdout_env (las, flas_stdout):
         flas = lasci.LASCI (las._scf, las.ncas_sub, las.nelecas_sub)
         flas.__dict__.update (las.__dict__)
+        params = getattr (las, 'relax_params', {})
+        glob = {key: val for key, val in params.items () if isinstance (key, str)}
+        flas.__dict__.update (glob)
         e_tot, e_cas, ci, mo_coeff, mo_energy, h2eff_sub, veff = \
             flas.kernel (kf.mo_coeff, ci0=kf.ci)
     ovlp = mo_coeff.conj ().T @ las._scf.get_ovlp () @ mo_coeff
diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 8e107c55..d4637233 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -811,6 +811,10 @@ def get_impurity_casscf (las, ifrag, imporb_builder=None):
     if imporb_builder is not None:
         imporb_builder.log = logger.new_logger (imc, imc.verbose)
     imc._imporb_builder = imporb_builder
+    params = getattr (las, 'impurity_params', {})
+    glob = {key: val for key, val in params.items () if isinstance (key, str)}
+    imc.__dict__.update (glob)
+    imc.__dict__.update (params.get (ifrag, {}))
     return imc
 
 if __name__=='__main__':
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index ab446249..fb962729 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -147,6 +147,29 @@ def get_grad (las, mo_coeff=None, ci=None, ugg=None, kf=None):
     return ugg.pack (gorb, gci)
 
 class LASSCFNoSymm (lasci.LASCINoSymm):
+    '''Extra attributes:
+
+    frags_orbs : list of length nfrags of list of integers
+        Identifies the definition of fragments as lists of AOs
+    impurity_params : list of length nfrags of dict
+        Key/value pairs are assigned as attributes of the impurity solver CASSCF object.
+        Use this to address, e.g., conv_tol_grad, max_cycle_macro, etc. of the impurity
+        subproblems
+    relax_params : dict
+        Key/value pairs are assigned as attributes to the active-active relaxation (``LASCI'')
+        subproblem, similar to impurity_params. Use this to, e.g., set a different max_cycle_macro
+        for the ``LASCI'' step.
+    '''
+    def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, **kwargs):
+        lasci.LASCINoSymm.__init__(self, mf, ncas, nelecas, ncore=ncore, spin_sub=spin_sub,
+                                   **kwargs)
+        self.impurity_params = {}
+        for i in range (self.nfrags):
+            self.impurity_params[i] = {}
+        self.relax_params = {}
+        keys = set (('frags_orbs','impurity_params','relax_params'))
+        self._keys = self._keys.union (keys)
+
     _ugg = lasscf_sync_o0.LASSCF_UnitaryGroupGenerators
     _kern = kernel
     get_grad = get_grad
@@ -204,6 +227,13 @@ def _finalize(self):
         return
 
 class LASSCFSymm (lasci.LASCISymm):
+    def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, **kwargs):
+        lasci.LASCISymm.__init__(self, mf, ncas, nelecas, ncore=ncore, spin_sub=spin_sub, **kwargs)
+        self.impurity_params = [{} for i in range (self.nfrags)]
+        self.relax_params = {}
+        keys = set (('frags_orbs','impurity_params','relax_params'))
+        self._keys = self._keys.union (keys)
+
     _ugg = lasscf_sync_o0.LASSCFSymm_UnitaryGroupGenerators
     _kern = kernel
     _finalize = LASSCFNoSymm._finalize

From d5ddf4081f683c29779d5b387206e3ba0b2c5f89 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 18 Jul 2024 13:44:11 -0500
Subject: [PATCH 34/78] better relax tol default

---
 my_pyscf/mcscf/lasscf_async/combine.py      | 26 +++++++++++++--------
 my_pyscf/mcscf/lasscf_async/lasscf_async.py | 24 +++++++++++++++++++
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 3ff42f16..f7195370 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -110,9 +110,10 @@ def __exit__(self, type, value, traceback):
         if getattr (self.las, 'with_df', None):
             self.las.with_df.stdout = self.las_stdout
 
-def relax (las, kf, freeze_inactive=False, frozen_frags=None):
-    # TODO: bottom-up 2-frag subproblem reimplementation
-    if frozen_frags is None: frozen_frags = []
+def relax (las, kf, freeze_inactive=False, unfrozen_frags=None):
+    if unfrozen_frags is None: frozen_frags = []
+    else:
+        frozen_frags = [i for i in range (las.nfrags) if i not in unfrozen_frags]
     log = lib.logger.new_logger (las, las.verbose)
     flas_stdout = getattr (las, '_flas_stdout', None)
     if flas_stdout is None:
@@ -131,13 +132,8 @@ def relax (las, kf, freeze_inactive=False, frozen_frags=None):
     with flas_stdout_env (las, flas_stdout):
         flas = lasci.LASCI (las._scf, las.ncas_sub, las.nelecas_sub)
         flas.__dict__.update (las.__dict__)
-        params = getattr (las, 'relax_params', {})
-        glob = {key: val for key, val in params.items () if isinstance (key, str)}
-        flas.__dict__.update (glob)
         flas.frozen = []
         flas.frozen_ci = frozen_frags
-        # TODO: ensure robust tolerance selection so things always make progress
-        flas.min_cycle_macro = 1
         if freeze_inactive:
             flas.frozen.extend (list (range (las.ncore)))
         for ifrag in frozen_frags:
@@ -148,6 +144,17 @@ def relax (las, kf, freeze_inactive=False, frozen_frags=None):
             nocc = las.ncore + las.ncas
             nmo = kf.mo_coeff.shape[1]
             flas.frozen.extend (list (range (nocc,nmo)))
+        # Default: scale down conv_tol_grad according to size of subproblem
+        scale = np.sqrt (flas.get_ugg ().nvar_tot / las.get_ugg ().nvar_tot)
+        flas.conv_tol_grad = scale * las.conv_tol_grad
+        flas.min_cycle_macro = 1
+        params = getattr (las, 'relax_params', {})
+        glob = {key: val for key, val in params.items () if isinstance (key, str)}
+        glob = {key: val for key, val in glob.items () if key not in ('frozen', 'frozen_ci')}
+        flas.__dict__.update (glob)
+        loc = params.get (tuple (unfrozen_frags), {})
+        loc = {key: val for key, val in loc.items () if key not in ('frozen', 'frozen_ci')}
+        flas.__dict__.update (loc)
         e_tot, e_cas, ci, mo_coeff, mo_energy, h2eff_sub, veff = \
             flas.kernel (kf.mo_coeff, ci0=kf.ci)
     ovlp = mo_coeff.conj ().T @ las._scf.get_ovlp () @ mo_coeff
@@ -208,8 +215,7 @@ def combine_pair (las, kf1, kf2):
     between the fragments assigned to each with the inactive and virtual orbitals frozen.'''
     kf3 = orth_orb (las, [kf1, kf2], kf_ref=kf1)
     i, j = select_aa_block (las, kf1.frags, kf2.frags, kf3.fock1)
-    frozen = [k for k in range (las.nfrags) if k not in (i,j)]
-    kf3 = relax (las, kf3, freeze_inactive=True, frozen_frags=frozen)
+    kf3 = relax (las, kf3, freeze_inactive=True, unfrozen_frags=(i,j))
     kf3.frags = kf1.frags.union (kf2.frags)
     return kf3
 
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index 37400803..45f444a4 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -145,6 +145,22 @@ def get_grad (las, mo_coeff=None, ci=None, ugg=None, kf=None):
                            veff=veff)
     return ugg.pack (gorb, gci)
 
+class SortedIndexDict (dict):
+    '''A dict, but all keys that are tuples are sorted so that, for instance, (1,2) is always
+    the same as (2,1)'''
+    def __setitem__(self, key, val):
+        if isinstance (key, tuple): key = tuple (sorted (key))
+        dict.__setitem__(self, key, val)
+    def __getitem__(self, key):
+        if isinstance (key, tuple): key = tuple (sorted (key))
+        return dict.__getitem__(self, key)
+    def get (self, key, *args):
+        if isinstance (key, tuple): key = tuple (sorted (key))
+        if len (args):
+            return dict.get (self, key, *args)
+        else:
+            return dict.get (self, key)
+
 class LASSCFNoSymm (lasci.LASCINoSymm):
     '''Extra attributes:
 
@@ -169,6 +185,14 @@ def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, **kwargs):
         keys = set (('frags_orbs','impurity_params','relax_params'))
         self._keys = self._keys.union (keys)
 
+    @property
+    def relax_params (self): return self._relax_params
+    @relax_params.setter
+    def relax_params (self, d):
+        self._relax_params = SortedIndexDict ()
+        for key, val in d.items ():
+            self._relax_params[key] = val
+
     _ugg = lasscf_sync_o0.LASSCF_UnitaryGroupGenerators
     _kern = kernel
     get_grad = get_grad

From 0a85587f0b863e07b22be4243eca893087f190fe Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 18 Jul 2024 13:54:26 -0500
Subject: [PATCH 35/78] test tol fiddle; add examples

---
 .../c2h4n4_equil_lasscf1010_631g.py             | 17 +++++++++++++++++
 .../lasscf_async/c2h4n4_str_lasscf1010_631g.py  | 15 +++++++++++++++
 tests/lasscf/test_lasscf_async.py               |  3 ++-
 3 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100755 examples/lasscf_async/c2h4n4_equil_lasscf1010_631g.py
 create mode 100755 examples/lasscf_async/c2h4n4_str_lasscf1010_631g.py

diff --git a/examples/lasscf_async/c2h4n4_equil_lasscf1010_631g.py b/examples/lasscf_async/c2h4n4_equil_lasscf1010_631g.py
new file mode 100755
index 00000000..33011b83
--- /dev/null
+++ b/examples/lasscf_async/c2h4n4_equil_lasscf1010_631g.py
@@ -0,0 +1,17 @@
+from mrh.tests.lasscf.c2h4n4_struct import structure as struct
+from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
+from pyscf.lib import logger
+from pyscf import scf
+
+mol = struct (0.0, 0.0, '6-31g', symmetry=False)
+mol.spin = 0
+mol.verbose = logger.DEBUG
+mol.output = 'c2h4n4_equil_lasscf1010_631g.log'
+mol.build ()
+mf = scf.RHF (mol).run ()
+las = LASSCF (mf, (4,2,4), ((2,2),(1,1),(2,2)), spin_sub=(1,1,1))
+mo_coeff = las.sort_mo ([7,8,16,18,22,23,24,26,33,34])
+mo_coeff = las.set_fragments_([[0,1,2],[3,4,5,6],[7,8,9]], mo_coeff=mo_coeff)
+las.kernel (mo_coeff)
+
+
diff --git a/examples/lasscf_async/c2h4n4_str_lasscf1010_631g.py b/examples/lasscf_async/c2h4n4_str_lasscf1010_631g.py
new file mode 100755
index 00000000..09b8f751
--- /dev/null
+++ b/examples/lasscf_async/c2h4n4_str_lasscf1010_631g.py
@@ -0,0 +1,15 @@
+from mrh.tests.lasscf.c2h4n4_struct import structure as struct
+from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
+from pyscf.lib import logger
+from pyscf import scf
+
+mol = struct (2.0, 2.0, '6-31g', symmetry=False)
+mol.spin = 8
+mol.verbose = logger.DEBUG
+mol.output = 'c2h4n4_str_lasscf1010_631g.log'
+mol.build ()
+mf = scf.RHF (mol).run ()
+las = LASSCF (mf, (4,2,4), ((2,2),(1,1),(2,2)), spin_sub=(1,1,1))
+mo_coeff = las.set_fragments_([[0,1,2],[3,4,5,6],[7,8,9]])
+las.kernel (mo_coeff)
+
diff --git a/tests/lasscf/test_lasscf_async.py b/tests/lasscf/test_lasscf_async.py
index f9678c79..d9e686a1 100644
--- a/tests/lasscf/test_lasscf_async.py
+++ b/tests/lasscf/test_lasscf_async.py
@@ -29,6 +29,7 @@ def tearDownModule():
 
 def _run_mod (mod):
     las=mod.LASSCF(mf, (2,2), (2,2))
+    las.conv_tol_grad = 1e-6
     localize_fn = getattr (las, 'set_fragments_', las.localize_init_guess)
     mo_coeff=localize_fn (frag_atom_list, mo0)
     las.state_average_(weights=[.2,]*5,
@@ -47,7 +48,7 @@ def test_implementations (self):
         with self.subTest ('asynchronous calculation converged'):
             self.assertTrue (las_asyn.converged)
         with self.subTest ('average energy'):
-            self.assertAlmostEqual (las_syn.e_tot, las_asyn.e_tot, 8)
+            self.assertAlmostEqual (las_syn.e_tot, las_asyn.e_tot, 7)
         for i in range (5):
             with self.subTest ('energy', state=i):
                 self.assertAlmostEqual (las_syn.e_states[i], las_asyn.e_states[i], 6)

From ac2e6b0410d3b972624e69efc7f68613008a0532 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 18 Jul 2024 14:34:14 -0500
Subject: [PATCH 36/78] expose pairwise combine in kernel and cleanup

---
 my_pyscf/mcscf/lasscf_async/combine.py      | 32 ++++-----------------
 my_pyscf/mcscf/lasscf_async/keyframe.py     |  2 ++
 my_pyscf/mcscf/lasscf_async/lasscf_async.py | 27 +++++++++--------
 3 files changed, 20 insertions(+), 41 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index f7195370..6c0fc668 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -169,12 +169,6 @@ def combine_o0 (las, kf2_list):
     kf1 = relax (las, kf1)
     return kf1
 
-def combine_o1 (las, kf2_list):
-    kf1 = kf2_list[0]
-    for kf2 in kf2_list[1:]:
-        kf1 = combine_pair (las, kf1, kf2)
-    return kf1
-
 def select_aa_block (las, frags1, frags2, fock1):
     '''Identify from two lists of candidate fragments the single active-active orbital-rotation
     gradient block with the largest norm
@@ -210,34 +204,18 @@ def select_aa_block (las, frags1, frags2, fock1):
     j = frags2[gmax % len (frags2)]
     return i, j
 
-def combine_pair (las, kf1, kf2):
+def combine_pair (las, kf1, kf2, kf_ref=None):
     '''Combine two keyframes and relax one specific block of active-active orbital rotations
     between the fragments assigned to each with the inactive and virtual orbitals frozen.'''
-    kf3 = orth_orb (las, [kf1, kf2], kf_ref=kf1)
+    if kf_ref is None: kf_ref=kf1
+    kf3 = orth_orb (las, [kf1, kf2], kf_ref=kf_ref)
     i, j = select_aa_block (las, kf1.frags, kf2.frags, kf3.fock1)
     kf3 = relax (las, kf3, freeze_inactive=True, unfrozen_frags=(i,j))
     kf3.frags = kf1.frags.union (kf2.frags)
     return kf3
 
-def impweights (las, mo_coeff, impurities):
-    '''Compute the weights of each MO in mo_coeff on the various impurities.
-
-    Args:
-        las : object of :class:`LASCINoSymm`
-        mo_coeff : ndarray of shape (nao,nmo)
-        impurities: list of length nfrag of objects of :class:`ImpurityCASSCF`
-
-    Returns:
-        weights: ndarray of shape (nmo, nfrag)
-    '''
-    smoH = mo_coeff.conj ().T @ las._scf.get_ovlp ()
-    weights = []
-    for imp in impurities:
-        a = smoH @ imp.mol.get_imporb_coeff ()
-        weights.append ((a @ a.conj ().T).diagonal ())
-    return np.stack (weights, axis=1)
-
-def combine_o1_rigid (las, kf1, kf2, kf_ref):
+# Function from failed algorithm. Retained for reference
+def combine_o1_kappa_rigid (las, kf1, kf2, kf_ref):
     '''Combine two keyframes (without relaxing the active orbitals) by weighting the kappa matrices
     with respect to a third reference keyframe democratically
 
diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 2687984b..03119843 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -204,6 +204,7 @@ def _count (lbl, i, j):
 
     return ncommon_core, ncommon_active, ncommon_virt
 
+# Function from failed algorithm. May have a future use.
 def get_kappa (las, kf1, kf2):
     '''Decompose unitary matrix of orbital rotations between two keyframes as
 
@@ -312,6 +313,7 @@ def get_kappa (las, kf1, kf2):
 
     return kappa, rmat
 
+# Function from failed algorithm. May have a future use.
 def democratic_matrix (las, mat, frags, mo_coeff):
     '''Weight a matrix in the "democratic DMET" way
 
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index 45f444a4..9cceded6 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -56,20 +56,19 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
             impurity.kernel ()
             kf2_list.append (impurity._push_keyframe (kf1))
 
-        # EXPERIMENTAL: examining differences in keyframes
-        for i in range (len (kf2_list)):
-            kfi = kf2_list[i]
-            log.info ('Comparing reference keyframe to fragment %d', i)
-            keyframe.count_common_orbitals (las, kf1, kfi)
-            keyframe.get_kappa (las, kf1, kfi)
-        for i, j in itertools.combinations (range (len (kf2_list)), 2):
-            kfi, kfj = kf2_list[i], kf2_list[j]
-            log.info ('Comparing keyframes for fragments %d and %d:', i, j)
-            keyframe.count_common_orbitals (las, kfi, kfj)
-            keyframe.get_kappa (las, kfi, kfj)
-
-        # 3. Combine from fragments. TODO: smaller chunks instead of one whole-molecule function
-        kf1 = combine.combine_o1 (las, kf2_list)
+        # 3. Combine from fragments. It should not be necessary to do this in any particular order,
+        #    and it should be possible to do March Madness tournament style; e.g.:
+        #
+        #       kf2_list[0] --- kf2_list[1]     kf2_list[2] --- kf2_list[3]
+        #                    |                               |
+        #                   kfi --------------------------- kfj
+        #                                    |
+        #                                   kf2
+        #
+        kf2 = kf2_list[0]
+        for kf3 in kf2_list[1:]:
+            kf2 = combine.combine_pair (las, kf2, kf3, kf_ref=kf1)
+        kf1 = kf2
 
         # Evaluate status and break if converged
         e_tot = las.energy_nuc () + las.energy_elec (

From 25b117d1cada86d991afa1f16f1bf38dcdfba669 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 18 Jul 2024 14:50:06 -0500
Subject: [PATCH 37/78] test tol fiddle

I refuse to set a test tolerance to 0.1 mEh. I have to be able to
do better than that.
---
 tests/lasscf/test_lasscf_async.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/lasscf/test_lasscf_async.py b/tests/lasscf/test_lasscf_async.py
index d9e686a1..b25a7db4 100644
--- a/tests/lasscf/test_lasscf_async.py
+++ b/tests/lasscf/test_lasscf_async.py
@@ -29,7 +29,7 @@ def tearDownModule():
 
 def _run_mod (mod):
     las=mod.LASSCF(mf, (2,2), (2,2))
-    las.conv_tol_grad = 1e-6
+    las.conv_tol_grad = 1e-7
     localize_fn = getattr (las, 'set_fragments_', las.localize_init_guess)
     mo_coeff=localize_fn (frag_atom_list, mo0)
     las.state_average_(weights=[.2,]*5,

From 71cb238ba07610f8ba203b721fee8817ed96e0f3 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 18 Jul 2024 15:56:01 -0500
Subject: [PATCH 38/78] Guardrail against combine_pair misuse

Raise an exception if trying to combine two keyframes that are
responsible for the same fragment
---
 my_pyscf/mcscf/lasscf_async/combine.py  |  4 ++++
 my_pyscf/mcscf/lasscf_async/keyframe.py | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 6c0fc668..421fdc39 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -208,6 +208,10 @@ def combine_pair (las, kf1, kf2, kf_ref=None):
     '''Combine two keyframes and relax one specific block of active-active orbital rotations
     between the fragments assigned to each with the inactive and virtual orbitals frozen.'''
     if kf_ref is None: kf_ref=kf1
+    if len (kf1.frags.intersection (kf2.frags)):
+        errstr = ("Cannot combine keyframes that are responsible for the same fragments "
+                  "({} {})").format (kf1.frags, kf2.frags)
+        raise RuntimeError (errstr)
     kf3 = orth_orb (las, [kf1, kf2], kf_ref=kf_ref)
     i, j = select_aa_block (las, kf1.frags, kf2.frags, kf3.fock1)
     kf3 = relax (las, kf3, freeze_inactive=True, unfrozen_frags=(i,j))
diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index 03119843..d7c96f8e 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -349,5 +349,24 @@ def democratic_matrix (las, mat, frags, mo_coeff):
 
     return u @ mat @ u.conj ().T
 
+# Thought I might need this; realize I don't. Might still be useful later.
+def fock_cycle (las, kf1):
+    '''For the inactive-virtual orbital rotations only, build and diagonalize the fock
+    matrix once'''
+    nao, nmo = kf1.mo_coeff.shape
+    ncore, ncas = las.ncore, las.ncas
+    nocc = ncore + ncas
+    nvirt = nmo - nocc
+    mo = np.append (kf1.mo_coeff[:,:ncore], kf1.mo_coeff[:,nocc:])
+    if not mo.shape[1]: return kf1
+    kf2 = kf1.copy ()
+    fock = las.get_hcore ()[None,:,:] + kf1.veff
+    fock = get_roothaan_fock (fock, kf1.dm1s, las._scf.get_ovlp())
+    orbsym = None # TODO: symmetry
+    fock = mo.conj ().T @ fock @ mo
+    ene, umat = las._eig (fock, 0, 0, orbsym)
+    if ncore: kf2.mo_coeff[:,:ncore] = mo @ umat[:,:ncore]
+    if nvirt: kf2.mo_coeff[:,nocc:] = mo @ umat[:,ncore:]
+    return kf2
 
 

From ba9f221af9477982c8ced5d8c7023ef96d2d9c2e Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 18 Jul 2024 16:25:22 -0500
Subject: [PATCH 39/78] lasscf_async old kernel option and example

---
 examples/lasscf_async/using_older_kernel.py   |  27 +++++
 my_pyscf/mcscf/lasscf_async/combine.py        |   7 +-
 .../mcscf/lasscf_async/old_aa_sync_kernel.py  | 110 ++++++++++++++++++
 3 files changed, 141 insertions(+), 3 deletions(-)
 create mode 100755 examples/lasscf_async/using_older_kernel.py
 create mode 100644 my_pyscf/mcscf/lasscf_async/old_aa_sync_kernel.py

diff --git a/examples/lasscf_async/using_older_kernel.py b/examples/lasscf_async/using_older_kernel.py
new file mode 100755
index 00000000..b04df8b2
--- /dev/null
+++ b/examples/lasscf_async/using_older_kernel.py
@@ -0,0 +1,27 @@
+from mrh.tests.lasscf.c2h4n4_struct import structure as struct
+from mrh.my_pyscf.mcscf.lasscf_async import LASSCF
+from pyscf.lib import logger
+from pyscf import scf
+
+mol = struct (0.0, 0.0, '6-31g', symmetry=False)
+mol.spin = 0
+mol.verbose = logger.DEBUG
+mol.output = 'using_older_kernel.log'
+mol.build ()
+mf = scf.RHF (mol).run ()
+las = LASSCF (mf, (4,2,4), ((2,2),(1,1),(2,2)), spin_sub=(1,1,1))
+mo_coeff = las.sort_mo ([7,8,16,18,22,23,24,26,33,34])
+mo_coeff = las.set_fragments_([[0,1,2],[3,4,5,6],[7,8,9]], mo_coeff=mo_coeff)
+
+# Note that just importing the patch_kernel function doesn't do anything, unlike the gpu4pyscf
+# "patch_*" functions. I prefer not to do things in imports and I hate global variables, so
+# instead, patch_kernel is a function that returns a patched version of that specific method
+# instance.
+from mrh.my_pyscf.mcscf.lasscf_async import old_aa_sync_kernel
+las = old_aa_sync_kernel.patch_kernel (las)
+
+# This will take fewer macrocycles to converge than c2h4n4_equil_lasscf1010_631g, to which it is
+# otherwise identical.
+las.kernel (mo_coeff)
+
+
diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 421fdc39..13234284 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -152,9 +152,10 @@ def relax (las, kf, freeze_inactive=False, unfrozen_frags=None):
         glob = {key: val for key, val in params.items () if isinstance (key, str)}
         glob = {key: val for key, val in glob.items () if key not in ('frozen', 'frozen_ci')}
         flas.__dict__.update (glob)
-        loc = params.get (tuple (unfrozen_frags), {})
-        loc = {key: val for key, val in loc.items () if key not in ('frozen', 'frozen_ci')}
-        flas.__dict__.update (loc)
+        if unfrozen_frags is not None:
+            loc = params.get (tuple (unfrozen_frags), {})
+            loc = {key: val for key, val in loc.items () if key not in ('frozen', 'frozen_ci')}
+            flas.__dict__.update (loc)
         e_tot, e_cas, ci, mo_coeff, mo_energy, h2eff_sub, veff = \
             flas.kernel (kf.mo_coeff, ci0=kf.ci)
     ovlp = mo_coeff.conj ().T @ las._scf.get_ovlp () @ mo_coeff
diff --git a/my_pyscf/mcscf/lasscf_async/old_aa_sync_kernel.py b/my_pyscf/mcscf/lasscf_async/old_aa_sync_kernel.py
new file mode 100644
index 00000000..a184c7f4
--- /dev/null
+++ b/my_pyscf/mcscf/lasscf_async/old_aa_sync_kernel.py
@@ -0,0 +1,110 @@
+# This is the original lasscf_async kernel, used prior to July 2024, which synchronously optimized
+# the active-orbital--active-orbital rotation degrees of freedom and required all impurity problems
+# to finish before combining them.
+
+import itertools
+import numpy as np
+from scipy import linalg
+from pyscf import lib
+from mrh.my_pyscf.mcscf.lasscf_async import keyframe, combine
+from mrh.my_pyscf.mcscf.lasscf_async.split import get_impurity_space_constructor
+from mrh.my_pyscf.mcscf.lasscf_async.crunch import get_impurity_casscf
+
+def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
+            assert_no_dupes=False, verbose=lib.logger.NOTE, frags_orbs=None,
+            **kwargs):
+    if mo_coeff is None: mo_coeff = las.mo_coeff
+    if assert_no_dupes: las.assert_no_duplicates ()
+    h2eff_sub = las.get_h2eff (mo_coeff)
+    if (ci0 is None or any ([c is None for c in ci0]) or
+      any ([any ([c2 is None for c2 in c1]) for c1 in ci0])):
+        ci0 = las.get_init_guess_ci (mo_coeff, h2eff_sub, ci0)
+    if (ci0 is None or any ([c is None for c in ci0]) or
+      any ([any ([c2 is None for c2 in c1]) for c1 in ci0])):
+        raise RuntimeError ("failed to populate get_init_guess")
+    if frags_orbs is None: frags_orbs = getattr (las, 'frags_orbs', None)
+    imporb_builders = [get_impurity_space_constructor (las, i, frag_orbs=frag_orbs)
+                       for i, frag_orbs in enumerate (frags_orbs)]
+    nfrags = len (las.ncas_sub)
+    log = lib.logger.new_logger(las, verbose)
+    t0 = (lib.logger.process_clock(), lib.logger.perf_counter())
+    kf0 = las.get_keyframe (mo_coeff, ci0) 
+    las._flas_stdout = None # TODO: more elegant model for this
+
+    ###############################################################################################
+    ################################## Begin actual kernel logic ##################################
+    ###############################################################################################
+
+
+
+
+
+    converged = False
+    it = 0
+    kf1 = kf0
+    impurities = [get_impurity_casscf (las, i, imporb_builder=builder)
+                  for i, builder in enumerate (imporb_builders)]
+    ugg = las.get_ugg ()
+    t1 = log.timer_debug1 ('impurity solver construction', *t0)
+    # GRAND CHALLENGE: replace rigid algorithm below with dynamic task scheduling
+    for it in range (las.max_cycle_macro):
+        # 1. Divide into fragments
+        for impurity in impurities: impurity._pull_keyframe_(kf1)
+
+        # 2. CASSCF on each fragment
+        kf2_list = []
+        for impurity in impurities:
+            impurity.kernel ()
+            kf2_list.append (impurity._push_keyframe (kf1))
+
+        # 3. Combine from fragments. TODO: smaller chunks instead of one whole-molecule function
+        kf1 = combine.combine_o0 (las, kf2_list)
+
+        # Evaluate status and break if converged
+        e_tot = las.energy_nuc () + las.energy_elec (
+            mo_coeff=kf1.mo_coeff, ci=kf1.ci, h2eff=kf1.h2eff_sub, veff=kf1.veff)
+        gvec = las.get_grad (ugg=ugg, kf=kf1)
+        norm_gvec = linalg.norm (gvec)
+        log.info ('LASSCF macro %d : E = %.15g ; |g| = %.15g', it, e_tot, norm_gvec)
+        t1 = log.timer ('one LASSCF macro cycle', *t1)
+        las.dump_chk (mo_coeff=kf1.mo_coeff, ci=kf1.ci)
+        if norm_gvec < conv_tol_grad:
+            converged = True
+            break
+
+
+
+
+
+    ###############################################################################################
+    ################################### End actual kernel logic ###################################
+    ###############################################################################################
+
+    if getattr (las, '_flas_stdout', None) is not None: las._flas_stdout.close ()
+    # TODO: more elegant model for this
+    mo_coeff, ci1, h2eff_sub, veff = kf1.mo_coeff, kf1.ci, kf1.h2eff_sub, kf1.veff
+    t1 = log.timer ('LASSCF {} macrocycles'.format (it), *t0)
+    e_tot = las.energy_nuc () + las.energy_elec (mo_coeff=mo_coeff, ci=ci1, h2eff=h2eff_sub,
+                                                 veff=veff)
+    e_states = las.energy_nuc () + np.array (las.states_energy_elec (mo_coeff=mo_coeff, ci=ci1,
+                                                                     h2eff=h2eff_sub, veff=veff))
+    # This crap usually goes in a "_finalize" function
+    log.info ('LASSCF %s after %d cycles', ('not converged', 'converged')[converged], it+1)
+    log.info ('LASSCF E = %.15g ; |g| = %.15g', e_tot,
+              norm_gvec)
+    t1 = log.timer ('LASSCF final energy', *t1)
+    mo_coeff, mo_energy, mo_occ, ci1, h2eff_sub = las.canonicalize (mo_coeff, ci1, veff=veff,
+                                                                    h2eff_sub=h2eff_sub)
+    t1 = log.timer ('LASSCF canonicalization', *t1)
+    t0 = log.timer ('LASSCF kernel function', *t0)
+
+    e_cas = None # TODO: get rid of this worthless, meaningless variable
+    return converged, e_tot, e_states, mo_energy, mo_coeff, e_cas, ci1, h2eff_sub, veff
+
+
+def patch_kernel (las):
+    class PatchedLAS (las.__class__):
+        _kern = kernel
+    return lib.view (las, PatchedLAS)
+
+

From c3f9baf32cc494ac7a09f0c0818840e623bad719 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 18 Jul 2024 17:42:03 -0500
Subject: [PATCH 40/78] relax_params pairwise assignment and example

---
 examples/lasscf_async/c2h6n4_lasscf88_sto3g.py | 13 ++++++++++++-
 my_pyscf/mcscf/lasscf_async/lasscf_async.py    |  2 ++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/examples/lasscf_async/c2h6n4_lasscf88_sto3g.py b/examples/lasscf_async/c2h6n4_lasscf88_sto3g.py
index da3fc09c..290072f2 100644
--- a/examples/lasscf_async/c2h6n4_lasscf88_sto3g.py
+++ b/examples/lasscf_async/c2h6n4_lasscf88_sto3g.py
@@ -2,6 +2,7 @@
 from mrh.tests.lasscf.c2h6n4_struct import structure as struct
 from mrh.my_pyscf.mcscf import lasscf_sync_o0 as syn
 from mrh.my_pyscf.mcscf import lasscf_async as asyn
+from mrh.my_pyscf.mcscf.lasscf_async import old_aa_sync_kernel
 
 mol = struct (1.0, 1.0, 'sto-3g', symmetry=False)
 mol.verbose = 5
@@ -22,7 +23,17 @@
 las_asyn.max_cycle_macro = 50 # by default, all subproblems use this
 las_asyn.impurity_params['max_cycle_macro'] = 51 # all fragments
 las_asyn.impurity_params[1]['max_cycle_macro'] = 52 # second fragment only (has priority)
-las_asyn.relax_params['max_cycle_macro'] = 53
+las_asyn.relax_params['max_cycle_macro'] = 53 # "flas", the "LASCI step"
+# If you have more than two fragments, you can apply specific parameters to orbital relaxations
+# between specific pairs of fragments. Addressing specific fragment pairs has priority over
+# the global settings above.
+las_asyn.relax_params['max_cycle_micro'] = 6 # loses
+las_asyn.relax_params[(0,1)]['max_cycle_micro'] = 7 # wins
+# However, the old_aa_sync_kernel doesn't relax the active orbitals in a pairwise way, so stuff like
+# "relax_params[(0,1)]" is ignored if we patch in the old kernel:
+# 
+# las_asyn = old_aa_sync_kernel.patch_kernel (las_asyn) # uncomment me to make 6 win
+
 mo = las_asyn.set_fragments_((list (range (3)), list (range (9,12))), mf.mo_coeff)
 las_asyn.state_average_(weights=[1,0,0,0,0],
                         spins=[[0,0],[2,0],[-2,0],[0,2],[0,-2]],
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index 9cceded6..3c47455b 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -181,6 +181,8 @@ def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, **kwargs):
         for i in range (self.nfrags):
             self.impurity_params[i] = {}
         self.relax_params = {}
+        for i, j in itertools.combinations (range (self.nfrags), 2):
+            self.relax_params[(i,j)] = {}
         keys = set (('frags_orbs','impurity_params','relax_params'))
         self._keys = self._keys.union (keys)
 

From 43278d3db119e84bdfd83ac4f56900128b21c53c Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 22 Jul 2024 11:54:47 -0500
Subject: [PATCH 41/78] combine_pair separate logfiles for diff pairs

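Each set of unfrozen fragments now gets its own logfile, named
mol.output + ".i.j" (the sorted fragment indices joined by dots), instead
of everything going into one giant ".flas" file. Calls to relax without
unfrozen_frags still use mol.output + ".flas".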
---
 my_pyscf/mcscf/lasscf_async/combine.py      | 14 ++++++++++----
 my_pyscf/mcscf/lasscf_async/lasscf_async.py |  4 ++--
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 13234284..fe75d06f 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -111,22 +111,28 @@ def __exit__(self, type, value, traceback):
             self.las.with_df.stdout = self.las_stdout
 
 def relax (las, kf, freeze_inactive=False, unfrozen_frags=None):
-    if unfrozen_frags is None: frozen_frags = []
+    flas_stdout = getattr (las, '_flas_stdout', None)
+    if unfrozen_frags is None:
+        frozen_frags = []
+        flas_tail = '.flas'
     else:
+        unfrozen_frags = tuple (sorted (unfrozen_frags)) # sorted
         frozen_frags = [i for i in range (las.nfrags) if i not in unfrozen_frags]
+        flas_stdout = flas_stdout.get (unfrozen_frags, None)
+        flas_tail = '.' + '.'.join ([str (s) for s in unfrozen_frags])
     log = lib.logger.new_logger (las, las.verbose)
-    flas_stdout = getattr (las, '_flas_stdout', None)
     if flas_stdout is None:
         output = getattr (las.mol, 'output', None)
         if not ((output is None) or (output=='/dev/null')):
-            flas_output = output + '.flas'
+            flas_output = output + flas_tail
             if las.verbose > lib.logger.QUIET:
                 if os.path.isfile (flas_output):
                     print('overwrite output file: %s' % flas_output)
                 else:
                     print('output file: %s' % flas_output)
             flas_stdout = open (flas_output, 'w')
-            las._flas_stdout = flas_stdout
+            if unfrozen_frags is None: las._flas_stdout = flas_stdout
+            else: las._flas_stdout[unfrozen_frags] = flas_stdout
         else:
             flas_stdout = las.stdout
     with flas_stdout_env (las, flas_stdout):
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index 3c47455b..cf92c6aa 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -28,7 +28,7 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
     log = lib.logger.new_logger(las, verbose)
     t0 = (lib.logger.process_clock(), lib.logger.perf_counter())
     kf0 = las.get_keyframe (mo_coeff, ci0) 
-    las._flas_stdout = None # TODO: more elegant model for this
+    las._flas_stdout = {} # TODO: more elegant model for this
 
     ###############################################################################################
     ################################## Begin actual kernel logic ##################################
@@ -90,7 +90,7 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
     ################################### End actual kernel logic ###################################
     ###############################################################################################
 
-    if getattr (las, '_flas_stdout', None) is not None: las._flas_stdout.close ()
+    for key, val in las._flas_stdout.items (): val.close ()
     # TODO: more elegant model for this
     mo_coeff, ci1, h2eff_sub, veff = kf1.mo_coeff, kf1.ci, kf1.h2eff_sub, kf1.veff
     t1 = log.timer ('LASSCF {} macrocycles'.format (it), *t0)

From 335b39cf92732f4fb3bf6acb90f3b3997c90e664 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 22 Jul 2024 14:00:09 -0500
Subject: [PATCH 42/78] PySCF compatibility check

---
 pyscf-forge_version.txt | 2 +-
 pyscf_version.txt       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyscf-forge_version.txt b/pyscf-forge_version.txt
index 9dd197fb..cadad9b7 100644
--- a/pyscf-forge_version.txt
+++ b/pyscf-forge_version.txt
@@ -1 +1 @@
-git+https://github.com/pyscf/pyscf-forge.git@8d764a0868b80fbfa70c1a956eab23ec3fdc8494
+git+https://github.com/pyscf/pyscf-forge.git@039ba178d9327f96d1ba401fec21d2813c2dca12
diff --git a/pyscf_version.txt b/pyscf_version.txt
index d45effe2..1ff9d86f 100644
--- a/pyscf_version.txt
+++ b/pyscf_version.txt
@@ -1 +1 @@
-git+https://github.com/pyscf/pyscf.git@beb7b1bcb40dec578392322d20126826f2d3e6ad
+git+https://github.com/pyscf/pyscf.git@bf0b1db22556a3c1b4c34426ea8627e636c1b096

From 0357ce392d09b02967bcfda4be24eb8c90154152 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 23 Jul 2024 12:06:09 -0500
Subject: [PATCH 43/78] lasscf async "march_madness" combination cycle

---
 my_pyscf/mcscf/lasscf_async/lasscf_async.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index cf92c6aa..44b261dc 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -57,7 +57,7 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
             kf2_list.append (impurity._push_keyframe (kf1))
 
         # 3. Combine from fragments. It should not be necessary to do this in any particular order,
-        #    and it should be possible to do March Madness tournament style; e.g.:
+        #    and the below does it March Madness tournament style; e.g.:
         #
         #       kf2_list[0] --- kf2_list[1]     kf2_list[2] --- kf2_list[3]
         #                    |                               |
@@ -65,10 +65,18 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
         #                                    |
         #                                   kf2
         #
-        kf2 = kf2_list[0]
-        for kf3 in kf2_list[1:]:
-            kf2 = combine.combine_pair (las, kf2, kf3, kf_ref=kf1)
-        kf1 = kf2
+        nkf = len (kf2_list)
+        ncyc = int (np.ceil (np.log2 (nkf)))
+        for i in range (int (np.ceil (np.log2 (nkf)))):
+            nkfi = len (kf2_list)
+            kf3_list = []
+            for kf2, kf3 in zip (kf2_list[::2],kf2_list[1::2]):
+                kf3_list.append (combine.combine_pair (las, kf2, kf3, kf_ref=kf1))
+            if nkfi%2: kf3_list.insert (len(kf3_list)-1, kf2_list[-1])
+            # Insert this at second-to-last position so that it gets "mixed in" next cycle
+            kf2_list = kf3_list
+        assert (len (kf2_list) == 1)
+        kf1 = kf2_list[0]
 
         # Evaluate status and break if converged
         e_tot = las.energy_nuc () + las.energy_elec (

From de7d4ec46d0dee7f614f2b836278bc672f0fb860 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 23 Jul 2024 12:53:01 -0500
Subject: [PATCH 44/78] separate ImpurityCASSCF into two classes

anticipating forthcoming generalization
---
 my_pyscf/mcscf/lasscf_async/crunch.py | 57 ++++++++++++++++-----------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index c6a48c91..486fbcc3 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -324,13 +324,7 @@ def casci_kernel(casci, mo_coeff=None, ci0=None, verbose=logger.NOTE, envs=None)
     return e_tot, e_cas, fcivec
 
 # This is the really tricky part
-class ImpurityCASSCF (mcscf.mc1step.CASSCF):
-
-    # make sure the fcisolver flag dump goes to the fragment output file,
-    # not the main output file
-    def dump_flags (self, verbose=None):
-        with lib.temporary_env (self.fcisolver, stdout=self.stdout):
-            mcscf.mc1step.CASSCF.dump_flags(self, verbose=verbose)
+class ImpuritySolver ():
 
     def _push_keyframe (self, kf1, mo_coeff=None, ci=None):
         '''Generate the whole-system MO and CI vectors corresponding to the current state of this
@@ -354,18 +348,20 @@ def _push_keyframe (self, kf1, mo_coeff=None, ci=None):
         if ci is None: ci=self.ci
         log = logger.new_logger (self, self.verbose)
         kf2 = kf1.copy ()
-        kf2.frags = set ([self._ifrag,])
+        kf2.frags = set (self._ifrags)
         imporb_coeff = self.mol.get_imporb_coeff ()
         mo_self = imporb_coeff @ mo_coeff
         las = self.mol._las
 
         # active orbital part should be easy
-        kf2.ci[self._ifrag] = self.ci
-        i = las.ncore + sum (las.ncas_sub[:self._ifrag])
-        j = i + las.ncas_sub[self._ifrag]
-        k = self.ncore
-        l = k + self.ncas
-        kf2.mo_coeff[:,i:j] = mo_self[:,k:l]
+        ci = self.ci if len (self._ifrags)>1 else [self.ci,]
+        for ix, ifrag in enumerate (self._ifrags):
+            kf2.ci[ifrag] = ci[ix]
+            i = las.ncore + sum (las.ncas_sub[:ifrag])
+            j = i + las.ncas_sub[ifrag]
+            k = self.ncore
+            l = k + self.ncas
+            kf2.mo_coeff[:,i:j] = mo_self[:,k:l]
 
         # Unentangled inactive orbitals
         s0 = las._scf.get_ovlp ()
@@ -452,14 +448,16 @@ def _update_space_(self, imporb_coeff, nelec_imp):
     def _update_trial_state_(self, mo_coeff, ci, veff, dm1s):
         '''Project whole-molecule MO coefficients and CI vectors into the
         impurity space and store on self.mo_coeff; self.ci.'''
-        _ifrag = self._ifrag
         las = self.mol._las
         mf = las._scf
         log = logger.new_logger(self, self.verbose)
 
+        ci = [ci[ifrag] for ifrag in self._ifrags]
+        if len (self._ifrags)==1: ci = ci[0]
+        self.ci = ci
+
         # Project mo_coeff and ci keyframe into impurity space and cache
         imporb_coeff = self.mol.get_imporb_coeff ()
-        self.ci = ci[_ifrag]
         # Inactive orbitals
         mo_core = mo_coeff[:,:las.ncore]
         s0 = mf.get_ovlp ()
@@ -472,9 +470,12 @@ def _update_trial_state_(self, mo_coeff, ci, veff, dm1s):
             log.warn ("pull_keyframe imporb problem: <i|P_emb|i> = %e", evals[idx])
         # Active and virtual orbitals (note self.ncas must be set at construction)
         nocc = self.ncore + self.ncas
-        i = las.ncore + sum (las.ncas_sub[:_ifrag])
-        j = i + las.ncas_sub[_ifrag]
-        mo_las = mo_coeff[:,i:j]
+        mo_las = []
+        for ifrag in self._ifrags:
+            i = las.ncore + sum (las.ncas_sub[:ifrag])
+            j = i + las.ncas_sub[ifrag]
+            mo_las.append (mo_coeff[:,i:j])
+        mo_las = np.concatenate (mo_las, axis=1)
         ovlp = (imporb_coeff @ self.mo_coeff[:,self.ncore:]).conj ().T @ s0 @ mo_las
         u, svals, vh = linalg.svd (ovlp)
         if (self.ncas>0) and not (np.allclose (svals[:self.ncas],1)):
@@ -501,7 +502,6 @@ def _update_impurity_hamiltonian_(self, mo_coeff, ci, h2eff_sub=None, e_states=N
         '''Update the Hamiltonian data contained within this impurity solver and all encapsulated
         impurity objects'''
         las = self.mol._las
-        _ifrag = self._ifrag
         if h2eff_sub is None: h2eff_sub = las.ao2mo (mo_coeff)
         if e_states is None: e_states = las.energy_nuc () + las.states_energy_elec (
             mo_coeff=mo_coeff, ci=ci, h2eff=h2eff_sub)
@@ -528,9 +528,10 @@ def _update_impurity_hamiltonian_(self, mo_coeff, ci, h2eff_sub=None, e_states=N
         dm1rs_full = las.states_make_casdm1s (ci=ci)
         dm1s_full = np.tensordot (self.fcisolver.weights, dm1rs_full, axes=1)
         dm1rs_stateshift = dm1rs_full - dm1s_full
-        i = sum (las.ncas_sub[:_ifrag])
-        j = i + las.ncas_sub[_ifrag]
-        dm1rs_stateshift[:,:,i:j,:] = dm1rs_stateshift[:,:,:,i:j] = 0
+        for ifrag in self._ifrags:
+            i = sum (las.ncas_sub[:ifrag])
+            j = i + las.ncas_sub[ifrag]
+            dm1rs_stateshift[:,:,i:j,:] = dm1rs_stateshift[:,:,:,i:j] = 0
         bmPu = getattr (h2eff_sub, 'bmPu', None)
         vj_r = self.get_vj_ext (mo_cas_full, dm1rs_stateshift.sum(1), bmPu=bmPu)
         vk_rs = self.get_vk_ext (mo_cas_full, dm1rs_stateshift, bmPu=bmPu)
@@ -591,6 +592,14 @@ def get_hcore_rs (self):
     def energy_nuc_r (self):
         return self._scf.energy_nuc () + self._imporb_h0_stateshift
 
+class ImpurityCASSCF (mcscf.mc1step.CASSCF, ImpuritySolver):
+
+    # make sure the fcisolver flag dump goes to the fragment output file,
+    # not the main output file
+    def dump_flags (self, verbose=None):
+        with lib.temporary_env (self.fcisolver, stdout=self.stdout):
+            mcscf.mc1step.CASSCF.dump_flags(self, verbose=verbose)
+
     def get_h1eff (self, mo_coeff=None, ncas=None, ncore=None):
         ''' must needs change the dimension of h1eff '''
         assert (False)
@@ -808,7 +817,7 @@ def get_impurity_casscf (las, ifrag, imporb_builder=None):
     if isinstance (las, _DFLASCI):
         imc = df.density_fit (imc)
     imc = _state_average_mcscf_solver (imc, las.fciboxes[ifrag])
-    imc._ifrag = ifrag
+    imc._ifrags = [ifrag,]
     if imporb_builder is not None:
         imporb_builder.log = logger.new_logger (imc, imc.verbose)
     imc._imporb_builder = imporb_builder

From 75cd8b02ec540561765472aa65b22ec279b5e9a7 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 23 Jul 2024 14:55:10 -0500
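Subject: [PATCH 45/78] safety commit

Work-in-progress checkpoint: adds the ImpurityLASCI_HessianOperator and
ImpurityLASCI classes to crunch.py, and imports lasci_sync and lasci for
them. Nothing in this patch calls the new classes yet.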
---
 my_pyscf/mcscf/lasscf_async/crunch.py | 78 ++++++++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 486fbcc3..3c55c217 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -5,7 +5,7 @@
 from pyscf.lib import logger
 from pyscf.fci.direct_spin1 import _unpack_nelec
 from pyscf.mcscf.addons import _state_average_mcscf_solver
-from mrh.my_pyscf.mcscf import _DFLASCI
+from mrh.my_pyscf.mcscf import _DFLASCI, lasci_sync, lasci
 import copy, json
 
 class ImpurityMole (gto.Mole):
@@ -805,6 +805,82 @@ def my_h_op (x):
 
         return g_orb, my_gorb_update, my_h_op, h_diag
 
+class ImpurityLASCI_HessianOperator (lasci_sync.LASCI_HessianOperator):
+    def _init_ham_(self, h2eff_sub, veff):
+        lasci_sync.LASCI_HessianOperator._init_ham_(self, h2eff_sub, veff)
+        las, mo_coeff, ncore, nocc = self.las, self.mo_coeff, self.ncore, self.nocc
+        h1s_sz = mo_coeff.conj ().T @ las._scf.get_hcore_sz () @ mo_coeff
+        self.h1s[0] += h1s_sz
+        self.h1s[1] -= h1s_sz
+        self.h1s_cas[0] += h1s_sz[:,:,ncore:nocc]
+        self.h1s_cas[1] -= h1s_sz[:,:,ncore:nocc]
+        self.e_tot += np.dot (h1s_sz.ravel (), (dm1s[0] - dm1s[1]).ravel ())
+        self.h1rs = np.dot (las.get_hcore_rs (), mo_coeff)
+        self.h1rs = np.tensordot (mo_coeff.conj (), h1rs, axes=((0),(2))).reshape (1,2,0,3)
+        for ix, h1rs in enumerate (self.h1frs):
+            i = sum (self.ncas_sub[:ix])
+            j = i + self.ncas_sub[ix]
+            h1rs[:,:,:,:] += self.h1rs[:,:,i:j,i:j]
+
+    def _init_orb_(self):
+        lasci_sync.LASCI_HessianOperator._init_orb_()
+        for w, h1s, casdm1s in zip (self.weights, self.h1rs, self.casdm1rs):
+            dh1s = h1s[:,ncore:nocc,ncore:nocc] - self.h1s[:,ncore:nocc,ncore:nocc]
+            self.fock1[:,ncore:nocc] += w * (dh1s[0] @ casdm1s[0] + dh1s[1] @ casdm1s[1])
+
+    # TODO: update hessian-vector elements
+
+class ImpurityLASCI (lasci.LASCINoSymm):
+    _hop = ImpurityLASCI_HessianOperator
+    # TODO: get_grad_orb, but it's actually only used for debugging in the kernel
+
+    def h1e_for_las (las, mo_coeff=None, ncas=None, ncore=None, nelecas=None, ci=None,
+                     ncas_sub=None, nelecas_sub=None, veff=None, h2eff_sub=None, casdm1s_sub=None,
+                     casdm1frs=None):
+        h1e_fr = lasci.LASCINoSymm.h1e_for_las (
+            las, mo_coeff=mo_coeff, ncas=ncas, ncore=ncore, nelecas=nelecas, ci=ci, 
+            ncas_sub=ncas_sub, nelecas_sub=nelecas_sub, veff=veff, h2eff_sub=h2eff_sub,
+            casdm1s_sub=casdm1s_sub, casdm1frs=casdm1frs
+        )
+        if mo_coeff is None: mo_coeff = self.mo_coeff
+        if ncas_sub is None: ncas_sub = self.ncas_sub
+        dh1_rs = np.dot (self.get_hcore_rs () - self.get_hcore ()[None,None,:,:], mo_coeff)
+        dh1_rs = np.tensordot (mo_coeff.conj (), dh1_rs, axes=((0),(2))).transpose (1,2,0,3)
+        for ix in range (len (ncas_sub)):
+            i = sum (ncas_sub[:ix])
+            j = i + ncas_sub[ix]
+            h1e_fr[ix] += dh1_rs[:,:,i:j,i:j]
+        return h1e_fr
+
+    def states_energy_elec (self, **kwargs):
+        energy_elec = lasci.LASCINoSymm.states_energy_elec (self, **kwargs)
+        mo_coeff = kwargs.get ('mo_coeff', self.mo_coeff)
+        ci = kwargs.get ('ci', self.ci)
+        ncore = kwargs.get ('ncore', self.ncore)
+        ncas = kwargs.get ('nncas', self.ncas)
+        ncas_sub = kwargs.get ('ncas_sub', self.ncas_sub)
+        nelecas_sub = kwargs.get ('nelecas_sub', self.nelecas_sub)
+        casdm1frs = kwargs.get ('casdm1frs', self.states_make_casdm1s_sub (
+            ci=ci, ncas_sub=ncas_sub, nelecas_sub=nelecas_sub
+        ))
+        casdm1rs = self.states_make_casdm1s (ci=ci, ncas_sub=ncas_sub, nelecas_sub=nelecas_sub,
+                                             casdm1frs=casdm1frs)
+        nao, nmo = mo_shape
+        nocc = ncore + ncas
+        mo_cas = mo_coeff[:,ncore:nocc]
+        dh1_rs = np.dot (self.get_hcore_rs () - self.get_hcore ()[None,None,:,:], mo_cas)
+        dh1_rs = np.tensordot (mo_cas.conj (), dh1_rs, axes=((0),(2))).transpose (1,2,0,3)
+        enuc_r = self.energy_nuc_r ()
+        for ix, (h, d) in enumerate (zip (dh1_rs, casdm1rs)):
+            energy_elec[ix] += np.dot (h.ravel (), d.ravel ())
+            energy_elec[ix] += enuc_r[ix] - self.energy_nuc ()
+        return energy_elec
+
+    def energy_elec (self, **kwargs):
+        energy_elec = self.states_energy_elec (**kwargs)
+        return np.dot (self.weights, energy_elec)
+
+
 def get_impurity_casscf (las, ifrag, imporb_builder=None):
     output = getattr (las.mol, 'output', None)
     # MRH: checking for '/dev/null' specifically as a string is how mol.build does it

From ad712de557c38461cb9e43b3b25e6b4a8201000a Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 23 Jul 2024 16:58:46 -0500
Subject: [PATCH 46/78] get_pair_lasci safety commit

Towards an efficient pairwise relaxation
---
 my_pyscf/mcscf/lasscf_async/crunch.py | 140 +++++++++++++++++++++++---
 1 file changed, 124 insertions(+), 16 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 3c55c217..3c4b8388 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -128,20 +128,23 @@ def _update_impham_1_(self, veff, dm1s, e_tot=None):
         df_eris_mem_error = MemoryError (("Density-fitted two-electron integrals in asynchronous "
                                           "LASSCF (outcore algorithm is not yet supported"))
         if getattr (mf, 'with_df', None) is not None:
-            # TODO: impurity outcore cderi
-            if not self._is_mem_enough (df_naux = mf.with_df.get_naoaux ()):
-                raise df_eris_mem_error
-            self.with_df._cderi = np.empty ((mf.with_df.get_naoaux (), nimp*(nimp+1)//2),
-                                            dtype=imporb_coeff.dtype)
-            ijmosym, mij_pair, moij, ijslice = ao2mo.incore._conc_mos (imporb_coeff, imporb_coeff,
-                                                                        compact=True)
-            b0 = 0
-            for eri1 in mf.with_df.loop ():
-                b1 = b0 + eri1.shape[0]
-                eri2 = self._cderi[b0:b1]
-                eri2 = ao2mo._ao2mo.nr_e2 (eri1, moij, ijslice, aosym='s2', mosym=ijmosym,
-                                           out=eri2)
-                b0 = b1
+            if getattr (self, 'with_df', None) is not None:
+                # TODO: impurity outcore cderi
+                if not self._is_mem_enough (df_naux = mf.with_df.get_naoaux ()):
+                    raise df_eris_mem_error
+                self.with_df._cderi = np.empty ((mf.with_df.get_naoaux (), nimp*(nimp+1)//2),
+                                                dtype=imporb_coeff.dtype)
+                ijmosym, mij_pair, moij, ijslice = ao2mo.incore._conc_mos (imporb_coeff, imporb_coeff,
+                                                                            compact=True)
+                b0 = 0
+                for eri1 in mf.with_df.loop ():
+                    b1 = b0 + eri1.shape[0]
+                    eri2 = self._cderi[b0:b1]
+                    eri2 = ao2mo._ao2mo.nr_e2 (eri1, moij, ijslice, aosym='s2', mosym=ijmosym,
+                                               out=eri2)
+                    b0 = b1
+            else:
+                self._eri = self.with_df.ao2mo (imporb_coeff, compact=True)
         else:
             if getattr (mf, '_eri', None) is None:
                 if not mf._is_mem_enough ():
@@ -806,6 +809,12 @@ def my_h_op (x):
         return g_orb, my_gorb_update, my_h_op, h_diag
 
 class ImpurityLASCI_HessianOperator (lasci_sync.LASCI_HessianOperator):
+    def _init_dms_(self, casdm1frs, casdm2fr):
+        lasci_sync.LASCI_HessianOperator._init_dms_(self, casdm1frs, casdm2fr)
+        ncore, nocc, nroots = self.ncore, self.nocc, self.nroots
+        self.dm1rs = np.stack ([self.dm1s,]*nroots, axis=0)
+        self.dm1rs[:,:,ncore:nocc,ncore:nocc] = self.casdm1rs
+
     def _init_ham_(self, h2eff_sub, veff):
         lasci_sync.LASCI_HessianOperator._init_ham_(self, h2eff_sub, veff)
         las, mo_coeff, ncore, nocc = self.las, self.mo_coeff, self.ncore, self.nocc
@@ -821,6 +830,7 @@ def _init_ham_(self, h2eff_sub, veff):
             i = sum (self.ncas_sub[:ix])
             j = i + self.ncas_sub[ix]
             h1rs[:,:,:,:] += self.h1rs[:,:,i:j,i:j]
+            # NOTE: this accounts for ci_response_diag 
 
     def _init_orb_(self):
         lasci_sync.LASCI_HessianOperator._init_orb_()
@@ -828,11 +838,76 @@ def _init_orb_(self):
             dh1s = h1s[:,ncore:nocc,ncore:nocc] - self.h1s[:,ncore:nocc,ncore:nocc]
             self.fock1[:,ncore:nocc] += w * (dh1s[0] @ casdm1s[0] + dh1s[1] @ casdm1s[1])
 
-    # TODO: update hessian-vector elements
+    def ci_response_offdiag (self, kappa1, h1s_prime):
+        ncore, nocc, ncas_sub, nroots = self.ncore, self.nocc, self.ncas_sub, self.nroots
+        kappa1_cas = kappa1[ncore:nocc,:]
+        h1frs = [np.zeros_like (h1) for h1 in h1frs_prime]
+        ## edit begin for hcore_rs
+        h1rs_cas = self.h1rs[:,:,:,ncore:nocc]
+        h1_core = -np.tensordot (kappa1_cas, h1rs_cas, axes=((1),(2))).transpose (1,2,0,3)
+        h1_core += h1_core.transpose (0,1,3,2)
+        ## edit end for hcore_rs
+        h2 = -np.tensordot (kappa1_cas, self.eri_paaa, axes=1)
+        h2 += h2.transpose (2,3,0,1)
+        h2 += h2.transpose (1,0,3,2)
+        # ^ h2 should also include + h.c.
+        for j, casdm1s in enumerate (self.casdm1rs):
+            for i, (h1rs, h1rs_prime) in enumerate (zip (h1frs, h1frs_prime)):
+                k = sum (ncas_sub[:i])
+                l = k + ncas_sub[i]
+                h1s, h1s_prime = h1rs[j], h1rs_prime[j]
+                dm1s = casdm1s.copy ()
+                dm1s[:,k:l,k:l] = 0.0 # no double-counting
+                dm1 = dm1s.sum (0)
+                h1s[:,:,:] = h1_core[j][:,k:l,k:l].copy ()
+                h1s[:,:,:] += np.tensordot (h2, dm1, axes=2)[None,k:l,k:l]
+                h1s[:,:,:] -= np.tensordot (dm1s, h2, axes=((1,2),(2,1)))[:,k:l,k:l]
+                #h1s[:,:,:] += h1s.transpose (0,2,1)
+                h1s[:,:,:] += h1s_prime[:,:,:]
+        Kci0 = self.Hci_all (None, h1frs, h2, self.ci)
+        Kci0 = [[Kc - c*(c.dot (Kc)) for Kc, c in zip (Kcr, cr)]
+                for Kcr, cr in zip (Kci0, self.ci)]
+        # ^ The definition of the unitary group generator compels you to do this always!!!
+        return Kci0
+
+    def orbital_response (self, kappa1, odm1s, ocm2, tdm1rs, tcm2, veff_prime):
+        kappa2 = lasci_sync.LASCI_Hessian_operator.orbital_response (
+            self, kappa1, odm1s, ocm2, tdm1rs, tcm2, veff_prime
+        )
+        h1rs = self.h1rs - self.h1s[None,:,:,:]
+        odm1rs = -np.dot (self.dm1rs, kappa1)
+        odm1rs += odm1rs.transpose (0,1,3,2)
+        edm1rs = odm1rs + tdm1rs
+        for w, h, d in zip (self.weights, h1rs, edm1rs):
+            fock1 = h[0] @ d[0] + h[1] @ d[1]
+            kappa2 += w * (fock1 - fock1.T)
+        return kappa2
 
 class ImpurityLASCI (lasci.LASCINoSymm):
     _hop = ImpurityLASCI_HessianOperator
-    # TODO: get_grad_orb, but it's actually only used for debugging in the kernel
+
+    def get_grad_orb (las, mo_coeff=None, ci=None, h2eff_sub=None, veff=None, dm1s=None, hermi=-1):
+        gorb = lasci.LASCINoSymm.get_grad_orb (las, mo_coeff=mo_coeff, ci=ci, h2eff_sub=h2eff_sub,
+                                               veff=veff, dm1s=dm1s, hermi=hermi)
+        if mo_coeff is None: mo_coeff = las.mo_coeff
+        nao, nmo = las.mo_coeff.shape
+        ncore, ncas = las.ncore, las.ncas
+        nocc = ncore + ncas
+        mo_cas = mo_coeff[:,ncore:nocc]
+        dh1_rs = np.dot (self.get_hcore_rs () - self.get_hcore ()[None,None,:,:], mo_cas)
+        dh1_rs = np.tensordot (mo_coeff.conj (), dh1_rs, axes=((0),(2))).transpose (1,2,0,3)
+        casdm1rs = las.states_make_casdm1s (ci=ci)
+        f = np.zeros ((nmo,nmo), dtype=gorb.dtype)
+        for w, h, d in zip (las.weights, dh1_rs, casdm1rs):
+            f[:,ncore:nocc] += w * (h[0] @ d[0] + h[1] @ d[1])
+        if hermi == -1:
+            return gorb + f - f.T
+        elif hermi == 1:
+            return gorb + .5*(f+f.T)
+        elif hermi == 0:
+            return gorb + f
+        else:
+            raise ValueError ("kwarg 'hermi' must = -1, 0, or +1")
 
     def h1e_for_las (las, mo_coeff=None, ncas=None, ncore=None, nelecas=None, ci=None,
                      ncas_sub=None, nelecas_sub=None, veff=None, h2eff_sub=None, casdm1s_sub=None,
@@ -903,6 +978,39 @@ def get_impurity_casscf (las, ifrag, imporb_builder=None):
     imc.__dict__.update (params.get (ifrag, {}))
     return imc
 
+def get_pair_lasci (las, frags):
+    stdout = getattr (las, '_flas_stdout', None)
+    if stdout is not None: stdout = stdout.get (unfrozen_frags, None)
+    output = getattr (las.mol, 'output', None)
+    if not ((output is None) or (output=='/dev/null')):
+        output = output + '.' + '.'.join ([str (s) for s in frags])
+    imol = ImpurityMole (las, output=output, stdout=stdout)
+    imf = ImpurityHF (imol)
+    ncas_sub = [las.ncas_sub[i] for i in frags]
+    nelecas_sub = [las.nelecas_sub[i] for i in frags]
+    ilas = ImpurityLASCI (imf, ncas_sub, nelecas_sub)
+    charges, spins, smults, wfnsyms = lasci.get_space_info (las)
+    ilas.state_average_(weights=las.weights, charges=charges[:,frags], spins=spins[:,frags],
+                        smults=smults[:,frags], wfnsyms=wfnsyms[:,frags])
+    def imporb_builder (mo_coeff, dm1s, veff, fock1, **kwargs):
+        idx = np.zeros (mo_coeff.shape[1], dtype=bool)
+        for ix in frags:    
+            i = ncore + sum (las.ncas_sub[:ix])
+            j = i + las.ncas_sub[ix]
+            idx[i:j] = True
+        fo_coeff = mo_coeff[:,idx]
+        nelec_f = sum ([sum (n) for n in nelecas_sub])
+        return fo_coeff, nelec_f
+    ilas._imporb_builder = imporb_builder
+    params = getattr (las, 'relax_params', {})
+    glob = {key: val for key, val in params.items () if isinstance (key, str)}
+    glob = {key: val for key, val in glob.items () if key not in ('frozen', 'frozen_ci')}
+    ilas.__dict__.update (glob)
+    loc = params.get (tuple (frags), {})
+    loc = {key: val for key, val in loc.items () if key not in ('frozen', 'frozen_ci')}
+    ilas.__dict__.update (loc)
+    return ilas
+
 if __name__=='__main__':
     from mrh.tests.lasscf.c2h6n4_struct import structure as struct
     mol = struct (1.0, 1.0, '6-31g', symmetry=False)

From e496a2b92b2c33481a4d0b4574c9ac68fdfcd32f Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 23 Jul 2024 17:46:11 -0500
Subject: [PATCH 47/78] get_pair_lasci safety commit

Some syntax debugging
---
 my_pyscf/mcscf/lasscf_async/combine.py | 6 +++++-
 my_pyscf/mcscf/lasscf_async/crunch.py  | 7 ++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index fe75d06f..f6745260 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -5,7 +5,7 @@
 from pyscf.lo import orth
 from pyscf.scf.rohf import get_roothaan_fock
 from mrh.my_pyscf.mcscf import lasci, _DFLASCI
-from mrh.my_pyscf.mcscf.lasscf_async import keyframe
+from mrh.my_pyscf.mcscf.lasscf_async import keyframe, crunch
 
 # TODO: symmetry
 def orth_orb (las, kf2_list, kf_ref=None):
@@ -222,6 +222,10 @@ def combine_pair (las, kf1, kf2, kf_ref=None):
     kf3 = orth_orb (las, [kf1, kf2], kf_ref=kf_ref)
     i, j = select_aa_block (las, kf1.frags, kf2.frags, kf3.fock1)
     kf3 = relax (las, kf3, freeze_inactive=True, unfrozen_frags=(i,j))
+    #pair = crunch.get_pair_lasci (las, (i,j))
+    #pair._pull_keyframe_(kf3)
+    #pair.kernel ()
+    #kf3 = pair._push_keyframe (kf3)
     kf3.frags = kf1.frags.union (kf2.frags)
     return kf3
 
diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 3c4b8388..c94801b0 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -883,7 +883,7 @@ def orbital_response (self, kappa1, odm1s, ocm2, tdm1rs, tcm2, veff_prime):
             kappa2 += w * (fock1 - fock1.T)
         return kappa2
 
-class ImpurityLASCI (lasci.LASCINoSymm):
+class ImpurityLASCI (lasci.LASCINoSymm, ImpuritySolver):
     _hop = ImpurityLASCI_HessianOperator
 
     def get_grad_orb (las, mo_coeff=None, ci=None, h2eff_sub=None, veff=None, dm1s=None, hermi=-1):
@@ -980,7 +980,7 @@ def get_impurity_casscf (las, ifrag, imporb_builder=None):
 
 def get_pair_lasci (las, frags):
     stdout = getattr (las, '_flas_stdout', None)
-    if stdout is not None: stdout = stdout.get (unfrozen_frags, None)
+    if stdout is not None: stdout = stdout.get (frags, None)
     output = getattr (las.mol, 'output', None)
     if not ((output is None) or (output=='/dev/null')):
         output = output + '.' + '.'.join ([str (s) for s in frags])
@@ -995,13 +995,14 @@ def get_pair_lasci (las, frags):
     def imporb_builder (mo_coeff, dm1s, veff, fock1, **kwargs):
         idx = np.zeros (mo_coeff.shape[1], dtype=bool)
         for ix in frags:    
-            i = ncore + sum (las.ncas_sub[:ix])
+            i = las.ncore + sum (las.ncas_sub[:ix])
             j = i + las.ncas_sub[ix]
             idx[i:j] = True
         fo_coeff = mo_coeff[:,idx]
         nelec_f = sum ([sum (n) for n in nelecas_sub])
         return fo_coeff, nelec_f
     ilas._imporb_builder = imporb_builder
+    ilas._ifrags = frags
     params = getattr (las, 'relax_params', {})
     glob = {key: val for key, val in params.items () if isinstance (key, str)}
     glob = {key: val for key, val in glob.items () if key not in ('frozen', 'frozen_ci')}

From 04982e19e5f5a6a0af1b31cd8c0ca18c5d1d3fc5 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 24 Jul 2024 14:00:09 -0500
Subject: [PATCH 48/78] lasscf_async combine_pair refactor syntax safety

The refactor is currently not converging, but not crashing
---
 debug/lasscf/debug_lasscf_async.py    |  6 +--
 my_pyscf/mcscf/lasci.py               |  5 +-
 my_pyscf/mcscf/lasscf_async/crunch.py | 73 +++++++++++++++++++--------
 tests/lasscf/test_lasscf_async.py     |  6 +--
 4 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/debug/lasscf/debug_lasscf_async.py b/debug/lasscf/debug_lasscf_async.py
index abe1b5f3..bbca9390 100644
--- a/debug/lasscf/debug_lasscf_async.py
+++ b/debug/lasscf/debug_lasscf_async.py
@@ -40,12 +40,12 @@ def _run_mod (mod):
 class KnownValues (unittest.TestCase):
 
     def test_implementations (self):
-        las_syn = _run_mod (syn)
-        with self.subTest ('synchronous calculation converged'):
-            self.assertTrue (las_syn.converged)
         las_asyn = _run_mod (asyn)
         with self.subTest ('asynchronous calculation converged'):
             self.assertTrue (las_asyn.converged)
+        las_syn = _run_mod (syn)
+        with self.subTest ('synchronous calculation converged'):
+            self.assertTrue (las_syn.converged)
         with self.subTest ('average energy'):
             self.assertAlmostEqual (las_syn.e_tot, las_asyn.e_tot, 8)
         for i in range (5):
diff --git a/my_pyscf/mcscf/lasci.py b/my_pyscf/mcscf/lasci.py
index 61bd2d0e..e8e37d5d 100644
--- a/my_pyscf/mcscf/lasci.py
+++ b/my_pyscf/mcscf/lasci.py
@@ -465,7 +465,8 @@ def canonicalize (las, mo_coeff=None, ci=None, casdm1fs=None, natorb_casdm1=None
 
     # I/O
     log = lib.logger.new_logger (las, las.verbose)
-    if las.verbose >= lib.logger.INFO:
+    label = las.mol.ao_labels()
+    if las.verbose >= lib.logger.INFO and len (label) == mo_coeff.shape[0]:
         if is_block_diag:
             for isub, nlas in enumerate (ncas_sub):
                 log.info ("Fragment %d natural orbitals", isub)
@@ -473,14 +474,12 @@ def canonicalize (las, mo_coeff=None, ci=None, casdm1fs=None, natorb_casdm1=None
                 j = i + nlas
                 log.info ('Natural occ %s', str (mo_occ[i:j]))
                 log.info ('Natural orbital (expansion on AOs) in CAS space')
-                label = las.mol.ao_labels()
                 mo_las = mo_coeff[:,i:j]
                 dump_mat.dump_rec(log.stdout, mo_las, label, start=1)
         else:
             log.info ("Delocalized natural orbitals do not reflect LAS fragmentation")
             log.info ('Natural occ %s', str (mo_occ[ncore:nocc]))
             log.info ('Natural orbital (expansion on AOs) in CAS space')
-            label = las.mol.ao_labels()
             mo_las = mo_coeff[:,ncore:nocc]
             dump_mat.dump_rec(log.stdout, mo_las, label, start=1)
 
diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index c94801b0..31d7dbda 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -66,8 +66,10 @@ def skip_value(dic):
                 dic1 = {}
                 for k,v in dic.items():
                     if (v is None or
-                        isinstance(v, (str, unicode, bool, int, float))):
+                        isinstance(v, (str, bool, int, float))):
                         dic1[k] = v
+                    elif isinstance(v, np.integer):
+                        dic1[k] = int (v)
                     elif isinstance(v, (list, tuple)):
                         dic1[k] = v   # Should I recursively skip_vaule?
                     elif isinstance(v, set):
@@ -358,13 +360,13 @@ def _push_keyframe (self, kf1, mo_coeff=None, ci=None):
 
         # active orbital part should be easy
         ci = self.ci if len (self._ifrags)>1 else [self.ci,]
+        idx = []
         for ix, ifrag in enumerate (self._ifrags):
             kf2.ci[ifrag] = ci[ix]
             i = las.ncore + sum (las.ncas_sub[:ifrag])
             j = i + las.ncas_sub[ifrag]
-            k = self.ncore
-            l = k + self.ncas
-            kf2.mo_coeff[:,i:j] = mo_self[:,k:l]
+            idx.extend (list (range (i,j)))
+        kf2.mo_coeff[:,idx] = mo_self[:,self.ncore:self.ncore+self.ncas]
 
         # Unentangled inactive orbitals
         s0 = las._scf.get_ovlp ()
@@ -501,7 +503,8 @@ def _update_trial_state_(self, mo_coeff, ci, veff, dm1s):
             w, c = linalg.eigh (fock_virt)
             self.mo_coeff[:,nocc:] = mo_virt @ c
 
-    def _update_impurity_hamiltonian_(self, mo_coeff, ci, h2eff_sub=None, e_states=None, veff=None, dm1s=None):
+    def _update_impurity_hamiltonian_(self, mo_coeff, ci, h2eff_sub=None, e_states=None, veff=None,
+                                      dm1s=None, casdm1rs=None, casdm2rs=None, weights=None):
         '''Update the Hamiltonian data contained within this impurity solver and all encapsulated
         impurity objects'''
         las = self.mol._las
@@ -513,15 +516,20 @@ def _update_impurity_hamiltonian_(self, mo_coeff, ci, h2eff_sub=None, e_states=N
         if veff is None: veff = las.get_veff (dm1s=dm1s, spin_sep=True)
         nocc = self.ncore + self.ncas
 
+        # Default these to the "CASSCF" way of making them
+        if weights is None: weights = self.fcisolver.weights
+        if casdm1rs is None or casdm2rs is None:
+            casdm1rs, casdm2rs = self.fcisolver.states_make_rdm12s (self.ci,self.ncas,self.nelecas)
+            casdm1rs = np.stack (casdm1rs, axis=1)
+            casdm2rs = np.stack (casdm2rs, axis=1)
+
         # Set underlying SCF object Hamiltonian to state-averaged Heff
         self._scf._update_impham_1_(veff, dm1s, e_tot=e_tot)
-        casdm1rs, casdm2rs = self.fcisolver.states_make_rdm12s (self.ci, self.ncas, self.nelecas)
-        casdm1rs = np.stack (casdm1rs, axis=1)
-        casdm2sr = np.stack (casdm2rs, axis=0)
+        casdm2sr = casdm2rs.transpose (1,0,2,3,4,5)
         casdm2r = casdm2sr[0] + casdm2sr[1] + casdm2sr[1].transpose (0,3,4,1,2) + casdm2sr[2]
-        casdm1s = np.tensordot (self.fcisolver.weights, casdm1rs, axes=1)
-        casdm2 = np.tensordot (self.fcisolver.weights, casdm2r, axes=1)
-        eri_cas = ao2mo.restore (1, self.get_h2eff (self.mo_coeff), self.ncas)
+        casdm1s = np.tensordot (weights, casdm1rs, axes=1)
+        casdm2 = np.tensordot (weights, casdm2r, axes=1)
+        eri_cas = ao2mo.restore (1, self.get_h2cas (self.mo_coeff), self.ncas)
         mo_core = self.mo_coeff[:,:self.ncore]
         mo_cas = self.mo_coeff[:,self.ncore:nocc]
         self._scf._update_impham_2_(mo_core, mo_cas, casdm1s, casdm2, eri_cas)
@@ -529,7 +537,7 @@ def _update_impurity_hamiltonian_(self, mo_coeff, ci, h2eff_sub=None, e_states=N
         # Set state-separated Hamiltonian 1-body
         mo_cas_full = mo_coeff[:,las.ncore:][:,:las.ncas]
         dm1rs_full = las.states_make_casdm1s (ci=ci)
-        dm1s_full = np.tensordot (self.fcisolver.weights, dm1rs_full, axes=1)
+        dm1s_full = np.tensordot (weights, dm1rs_full, axes=1)
         dm1rs_stateshift = dm1rs_full - dm1s_full
         for ifrag in self._ifrags:
             i = sum (las.ncas_sub[:ifrag])
@@ -821,11 +829,11 @@ def _init_ham_(self, h2eff_sub, veff):
         h1s_sz = mo_coeff.conj ().T @ las._scf.get_hcore_sz () @ mo_coeff
         self.h1s[0] += h1s_sz
         self.h1s[1] -= h1s_sz
-        self.h1s_cas[0] += h1s_sz[:,:,ncore:nocc]
-        self.h1s_cas[1] -= h1s_sz[:,:,ncore:nocc]
-        self.e_tot += np.dot (h1s_sz.ravel (), (dm1s[0] - dm1s[1]).ravel ())
+        self.h1s_cas[0] += h1s_sz[:,ncore:nocc]
+        self.h1s_cas[1] -= h1s_sz[:,ncore:nocc]
+        self.e_tot += np.dot (h1s_sz.ravel (), (self.dm1s[0] - self.dm1s[1]).ravel ())
         self.h1rs = np.dot (las.get_hcore_rs (), mo_coeff)
-        self.h1rs = np.tensordot (mo_coeff.conj (), h1rs, axes=((0),(2))).reshape (1,2,0,3)
+        self.h1rs = np.tensordot (mo_coeff.conj (), self.h1rs, axes=((0),(2))).transpose (1,2,0,3)
         for ix, h1rs in enumerate (self.h1frs):
             i = sum (self.ncas_sub[:ix])
             j = i + self.ncas_sub[ix]
@@ -833,12 +841,13 @@ def _init_ham_(self, h2eff_sub, veff):
             # NOTE: this accounts for ci_response_diag 
 
     def _init_orb_(self):
-        lasci_sync.LASCI_HessianOperator._init_orb_()
+        ncore, nocc = self.ncore, self.nocc
+        lasci_sync.LASCI_HessianOperator._init_orb_(self)
         for w, h1s, casdm1s in zip (self.weights, self.h1rs, self.casdm1rs):
             dh1s = h1s[:,ncore:nocc,ncore:nocc] - self.h1s[:,ncore:nocc,ncore:nocc]
             self.fock1[:,ncore:nocc] += w * (dh1s[0] @ casdm1s[0] + dh1s[1] @ casdm1s[1])
 
-    def ci_response_offdiag (self, kappa1, h1s_prime):
+    def ci_response_offdiag (self, kappa1, h1frs_prime):
         ncore, nocc, ncas_sub, nroots = self.ncore, self.nocc, self.ncas_sub, self.nroots
         kappa1_cas = kappa1[ncore:nocc,:]
         h1frs = [np.zeros_like (h1) for h1 in h1frs_prime]
@@ -871,7 +880,7 @@ def ci_response_offdiag (self, kappa1, h1s_prime):
         return Kci0
 
     def orbital_response (self, kappa1, odm1s, ocm2, tdm1rs, tcm2, veff_prime):
-        kappa2 = lasci_sync.LASCI_Hessian_operator.orbital_response (
+        kappa2 = lasci_sync.LASCI_HessianOperator.orbital_response (
             self, kappa1, odm1s, ocm2, tdm1rs, tcm2, veff_prime
         )
         h1rs = self.h1rs - self.h1s[None,:,:,:]
@@ -886,6 +895,30 @@ def orbital_response (self, kappa1, odm1s, ocm2, tdm1rs, tcm2, veff_prime):
 class ImpurityLASCI (lasci.LASCINoSymm, ImpuritySolver):
     _hop = ImpurityLASCI_HessianOperator
 
+    def _update_impurity_hamiltonian_(self, mo_coeff, ci, h2eff_sub=None, e_states=None, veff=None,
+                                      dm1s=None, casdm1rs=None, casdm2rs=None, weights=None):
+        if weights is None: weights = self.weights
+        if casdm1rs is None: casdm1rs = self.states_make_casdm1s (ci=self.ci)
+        if casdm2rs is None: 
+            casdm2frs = self.states_make_casdm2s_sub (ci=self.ci)
+            nroots = len (casdm1rs)
+            ncas = casdm1rs[0][0].shape[0]
+            casdm2rs = np.zeros ((nroots,3,ncas,ncas,ncas,ncas), dtype=casdm1rs[0][0].dtype)
+            for d2, d1 in zip (casdm2rs, casdm1rs):
+                d1d1_aa = np.multiply.outer (d1[0], d1[0])
+                d2[0] = d1d1_aa - d1d1_aa.transpose (0,3,2,1)
+                d2[1] = np.multiply.outer (d1[0], d1[1])
+                d1d1_bb = np.multiply.outer (d1[1], d1[1])
+                d2[2] = d1d1_bb - d1d1_bb.transpose (0,3,2,1)
+            for ifrag, d2f in enumerate (casdm2frs):
+                i = sum (self.ncas_sub[:ifrag])
+                j = i + self.ncas_sub[ifrag]
+                casdm2rs[:,:,i:j,i:j,i:j,i:j] = d2f[:]
+        ImpuritySolver._update_impurity_hamiltonian_(
+            self, mo_coeff, ci, h2eff_sub=h2eff_sub, e_states=e_states, veff=veff, dm1s=dm1s,
+            casdm1rs=casdm1rs, casdm2rs=casdm2rs, weights=weights
+        )
+
     def get_grad_orb (las, mo_coeff=None, ci=None, h2eff_sub=None, veff=None, dm1s=None, hermi=-1):
         gorb = lasci.LASCINoSymm.get_grad_orb (las, mo_coeff=mo_coeff, ci=ci, h2eff_sub=h2eff_sub,
                                                veff=veff, dm1s=dm1s, hermi=hermi)
@@ -940,7 +973,7 @@ def states_energy_elec (self, **kwargs):
         ))
         casdm1rs = self.states_make_casdm1s (ci=ci, ncas_sub=ncas_sub, nelecas_sub=nelecas_sub,
                                              casdm1frs=casdm1frs)
-        nao, nmo = mo_shape
+        nao, nmo = mo_coeff.shape
         nocc = ncore + ncas
         mo_cas = mo_coeff[:,ncore:nocc]
         dh1_rs = np.dot (self.get_hcore_rs () - self.get_hcore ()[None,None,:,:], mo_cas)
diff --git a/tests/lasscf/test_lasscf_async.py b/tests/lasscf/test_lasscf_async.py
index b25a7db4..20c9e49d 100644
--- a/tests/lasscf/test_lasscf_async.py
+++ b/tests/lasscf/test_lasscf_async.py
@@ -41,12 +41,12 @@ def _run_mod (mod):
 class KnownValues (unittest.TestCase):
 
     def test_implementations (self):
-        las_syn = _run_mod (syn)
-        with self.subTest ('synchronous calculation converged'):
-            self.assertTrue (las_syn.converged)
         las_asyn = _run_mod (asyn)
         with self.subTest ('asynchronous calculation converged'):
             self.assertTrue (las_asyn.converged)
+        las_syn = _run_mod (syn)
+        with self.subTest ('synchronous calculation converged'):
+            self.assertTrue (las_syn.converged)
         with self.subTest ('average energy'):
             self.assertAlmostEqual (las_syn.e_tot, las_asyn.e_tot, 7)
         for i in range (5):

From 5f1188e7c4f952a28f0dba6fcea3f27528029323 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 24 Jul 2024 18:50:32 -0500
Subject: [PATCH 49/78] lasscf_async pair relaxation refactor complete

---
 debug/lasscf/debug_lasscf_async.py     |  1 +
 my_pyscf/mcscf/lasscf_async/combine.py | 14 +++++++----
 my_pyscf/mcscf/lasscf_async/crunch.py  | 35 +++++++++++++++++---------
 3 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/debug/lasscf/debug_lasscf_async.py b/debug/lasscf/debug_lasscf_async.py
index bbca9390..8ad6a0e8 100644
--- a/debug/lasscf/debug_lasscf_async.py
+++ b/debug/lasscf/debug_lasscf_async.py
@@ -29,6 +29,7 @@ def tearDownModule():
 
 def _run_mod (mod):
     las=mod.LASSCF(mf, (2,2), (2,2))
+    las.conv_tol_grad = 1e-7
     localize_fn = getattr (las, 'set_fragments_', las.localize_init_guess)
     mo_coeff=localize_fn (frag_atom_list, mo0)
     las.state_average_(weights=[.2,]*5,
diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index f6745260..cca2e3eb 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -221,11 +221,15 @@ def combine_pair (las, kf1, kf2, kf_ref=None):
         raise RuntimeError (errstr)
     kf3 = orth_orb (las, [kf1, kf2], kf_ref=kf_ref)
     i, j = select_aa_block (las, kf1.frags, kf2.frags, kf3.fock1)
-    kf3 = relax (las, kf3, freeze_inactive=True, unfrozen_frags=(i,j))
-    #pair = crunch.get_pair_lasci (las, (i,j))
-    #pair._pull_keyframe_(kf3)
-    #pair.kernel ()
-    #kf3 = pair._push_keyframe (kf3)
+    #kf3 = relax (las, kf3, freeze_inactive=True, unfrozen_frags=(i,j))
+    pair = crunch.get_pair_lasci (las, (i,j))
+    pair._pull_keyframe_(kf3)
+    if pair.conv_tol_grad == 'DEFAULT':
+        # Default: scale down conv_tol_grad according to size of subproblem
+        scale = np.sqrt (pair.get_ugg ().nvar_tot / las.get_ugg ().nvar_tot)
+        pair.conv_tol_grad = scale * las.conv_tol_grad
+    pair.kernel ()
+    kf3 = pair._push_keyframe (kf3)
     kf3.frags = kf1.frags.union (kf2.frags)
     return kf3
 
diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 31d7dbda..0ba6c015 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -146,7 +146,7 @@ def _update_impham_1_(self, veff, dm1s, e_tot=None):
                                                out=eri2)
                     b0 = b1
             else:
-                self._eri = self.with_df.ao2mo (imporb_coeff, compact=True)
+                self._eri = mf.with_df.ao2mo (imporb_coeff, compact=True)
         else:
             if getattr (mf, '_eri', None) is None:
                 if not mf._is_mem_enough ():
@@ -826,19 +826,23 @@ def _init_dms_(self, casdm1frs, casdm2fr):
     def _init_ham_(self, h2eff_sub, veff):
         lasci_sync.LASCI_HessianOperator._init_ham_(self, h2eff_sub, veff)
         las, mo_coeff, ncore, nocc = self.las, self.mo_coeff, self.ncore, self.nocc
+        h1rs = np.dot (las.get_hcore_rs (), mo_coeff)
+        h1rs = np.tensordot (mo_coeff.conj (), h1rs, axes=((0),(2))).transpose (1,2,0,3)
+        hcore = mo_coeff.conj ().T @ las.get_hcore () @ mo_coeff
+        dh1rs = h1rs - hcore[None,None,:,:]
+        for ix, h1rs in enumerate (self.h1frs):
+            i = sum (self.ncas_sub[:ix])
+            j = i + self.ncas_sub[ix]
+            h1rs[:,:,:,:] += dh1rs[:,:,i:j,i:j]
+            # NOTE: this accounts for ci_response_diag
+        self.h1rs = self.h1s[None,:,:,:] + dh1rs
+        self.h1rs_cas = self.h1s_cas[None,:,:,:] + dh1rs[:,:,:,ncore:nocc]
         h1s_sz = mo_coeff.conj ().T @ las._scf.get_hcore_sz () @ mo_coeff
         self.h1s[0] += h1s_sz
         self.h1s[1] -= h1s_sz
         self.h1s_cas[0] += h1s_sz[:,ncore:nocc]
         self.h1s_cas[1] -= h1s_sz[:,ncore:nocc]
-        self.e_tot += np.dot (h1s_sz.ravel (), (self.dm1s[0] - self.dm1s[1]).ravel ())
-        self.h1rs = np.dot (las.get_hcore_rs (), mo_coeff)
-        self.h1rs = np.tensordot (mo_coeff.conj (), self.h1rs, axes=((0),(2))).transpose (1,2,0,3)
-        for ix, h1rs in enumerate (self.h1frs):
-            i = sum (self.ncas_sub[:ix])
-            j = i + self.ncas_sub[ix]
-            h1rs[:,:,:,:] += self.h1rs[:,:,i:j,i:j]
-            # NOTE: this accounts for ci_response_diag 
+        self.e_tot += np.einsum ('rspq,rspq,r->', dh1rs, self.dm1rs, self.weights)
 
     def _init_orb_(self):
         ncore, nocc = self.ncore, self.nocc
@@ -852,8 +856,7 @@ def ci_response_offdiag (self, kappa1, h1frs_prime):
         kappa1_cas = kappa1[ncore:nocc,:]
         h1frs = [np.zeros_like (h1) for h1 in h1frs_prime]
         ## edit begin for hcore_rs
-        h1rs_cas = self.h1rs[:,:,:,ncore:nocc]
-        h1_core = -np.tensordot (kappa1_cas, h1rs_cas, axes=((1),(2))).transpose (1,2,0,3)
+        h1_core = -np.tensordot (kappa1_cas, self.h1rs_cas, axes=((1),(2))).transpose (1,2,0,3)
         h1_core += h1_core.transpose (0,1,3,2)
         ## edit end for hcore_rs
         h2 = -np.tensordot (kappa1_cas, self.eri_paaa, axes=1)
@@ -1012,16 +1015,22 @@ def get_impurity_casscf (las, ifrag, imporb_builder=None):
     return imc
 
 def get_pair_lasci (las, frags):
-    stdout = getattr (las, '_flas_stdout', None)
+    stdout_dict = stdout = getattr (las, '_flas_stdout', None)
     if stdout is not None: stdout = stdout.get (frags, None)
     output = getattr (las.mol, 'output', None)
     if not ((output is None) or (output=='/dev/null')):
         output = output + '.' + '.'.join ([str (s) for s in frags])
     imol = ImpurityMole (las, output=output, stdout=stdout)
+    if stdout is None and stdout_dict is not None:
+        stdout_dict[frags] = imol.stdout
     imf = ImpurityHF (imol)
+    if isinstance (las, _DFLASCI):
+        imf = imf.density_fit ()
     ncas_sub = [las.ncas_sub[i] for i in frags]
     nelecas_sub = [las.nelecas_sub[i] for i in frags]
     ilas = ImpurityLASCI (imf, ncas_sub, nelecas_sub)
+    if isinstance (las, _DFLASCI):
+        ilas = lasci.density_fit (ilas, with_df=imf.with_df)
     charges, spins, smults, wfnsyms = lasci.get_space_info (las)
     ilas.state_average_(weights=las.weights, charges=charges[:,frags], spins=spins[:,frags],
                         smults=smults[:,frags], wfnsyms=wfnsyms[:,frags])
@@ -1036,6 +1045,8 @@ def imporb_builder (mo_coeff, dm1s, veff, fock1, **kwargs):
         return fo_coeff, nelec_f
     ilas._imporb_builder = imporb_builder
     ilas._ifrags = frags
+    ilas.conv_tol_grad = 'DEFAULT'
+    ilas.min_cycle_macro = 1
     params = getattr (las, 'relax_params', {})
     glob = {key: val for key, val in params.items () if isinstance (key, str)}
     glob = {key: val for key, val in glob.items () if key not in ('frozen', 'frozen_ci')}

From 40ac088138366f30ab52118393d8e45ff9af0bec Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 25 Jul 2024 11:56:39 -0500
Subject: [PATCH 50/78] cleanup LASCI_HessianOperator child class

---
 my_pyscf/mcscf/lasscf_async/crunch.py | 56 +++++++++++----------------
 1 file changed, 22 insertions(+), 34 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 0ba6c015..dc9ddea4 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -830,18 +830,16 @@ def _init_ham_(self, h2eff_sub, veff):
         h1rs = np.tensordot (mo_coeff.conj (), h1rs, axes=((0),(2))).transpose (1,2,0,3)
         hcore = mo_coeff.conj ().T @ las.get_hcore () @ mo_coeff
         dh1rs = h1rs - hcore[None,None,:,:]
+        # _init_ci_ and ci_response_diag
         for ix, h1rs in enumerate (self.h1frs):
             i = sum (self.ncas_sub[:ix])
             j = i + self.ncas_sub[ix]
             h1rs[:,:,:,:] += dh1rs[:,:,i:j,i:j]
-            # NOTE: this accounts for ci_response_diag
+        # _init_orb_ and orbital_response 
         self.h1rs = self.h1s[None,:,:,:] + dh1rs
+        # ci_response_offdiag
         self.h1rs_cas = self.h1s_cas[None,:,:,:] + dh1rs[:,:,:,ncore:nocc]
-        h1s_sz = mo_coeff.conj ().T @ las._scf.get_hcore_sz () @ mo_coeff
-        self.h1s[0] += h1s_sz
-        self.h1s[1] -= h1s_sz
-        self.h1s_cas[0] += h1s_sz[:,ncore:nocc]
-        self.h1s_cas[1] -= h1s_sz[:,ncore:nocc]
+        # Energy reportback
         self.e_tot += np.einsum ('rspq,rspq,r->', dh1rs, self.dm1rs, self.weights)
 
     def _init_orb_(self):
@@ -851,36 +849,26 @@ def _init_orb_(self):
             dh1s = h1s[:,ncore:nocc,ncore:nocc] - self.h1s[:,ncore:nocc,ncore:nocc]
             self.fock1[:,ncore:nocc] += w * (dh1s[0] @ casdm1s[0] + dh1s[1] @ casdm1s[1])
 
+    def _get_Horb_diag (self):
+        # It's unclear that this is even necessary...
+        Hdiag = 0
+        for w, h, d in zip (self.weights, self.h1rs, self.dm1rs):
+            with lib.temporary_env (self, h1s=h, dm1s=d):
+                Hdiag += w * lasci_sync.LASCI_HessianOperator._get_Horb_diag (self)
+        return Hdiag
+
     def ci_response_offdiag (self, kappa1, h1frs_prime):
-        ncore, nocc, ncas_sub, nroots = self.ncore, self.nocc, self.ncas_sub, self.nroots
+        ncore, nocc, ncas_sub = self.ncore, self.nocc, self.ncas_sub
         kappa1_cas = kappa1[ncore:nocc,:]
-        h1frs = [np.zeros_like (h1) for h1 in h1frs_prime]
-        ## edit begin for hcore_rs
-        h1_core = -np.tensordot (kappa1_cas, self.h1rs_cas, axes=((1),(2))).transpose (1,2,0,3)
-        h1_core += h1_core.transpose (0,1,3,2)
-        ## edit end for hcore_rs
-        h2 = -np.tensordot (kappa1_cas, self.eri_paaa, axes=1)
-        h2 += h2.transpose (2,3,0,1)
-        h2 += h2.transpose (1,0,3,2)
-        # ^ h2 should also include + h.c.
-        for j, casdm1s in enumerate (self.casdm1rs):
-            for i, (h1rs, h1rs_prime) in enumerate (zip (h1frs, h1frs_prime)):
-                k = sum (ncas_sub[:i])
-                l = k + ncas_sub[i]
-                h1s, h1s_prime = h1rs[j], h1rs_prime[j]
-                dm1s = casdm1s.copy ()
-                dm1s[:,k:l,k:l] = 0.0 # no double-counting
-                dm1 = dm1s.sum (0)
-                h1s[:,:,:] = h1_core[j][:,k:l,k:l].copy ()
-                h1s[:,:,:] += np.tensordot (h2, dm1, axes=2)[None,k:l,k:l]
-                h1s[:,:,:] -= np.tensordot (dm1s, h2, axes=((1,2),(2,1)))[:,k:l,k:l]
-                #h1s[:,:,:] += h1s.transpose (0,2,1)
-                h1s[:,:,:] += h1s_prime[:,:,:]
-        Kci0 = self.Hci_all (None, h1frs, h2, self.ci)
-        Kci0 = [[Kc - c*(c.dot (Kc)) for Kc, c in zip (Kcr, cr)]
-                for Kcr, cr in zip (Kci0, self.ci)]
-        # ^ The definition of the unitary group generator compels you to do this always!!!
-        return Kci0
+        dh1rs_cas = self.h1rs_cas - self.h1s_cas[None,:,:,:]
+        dh1_core = -np.tensordot (kappa1_cas, dh1rs_cas, axes=((1),(2)))
+        dh1_core = dh1_core.transpose (1,2,0,3) + dh1_core.transpose (1,2,3,0)
+        for i, h1rs in enumerate (h1frs_prime):
+            j = sum (ncas_sub[:i])
+            k = j + ncas_sub[i]
+            h1rs[:,:,:,:] += dh1_core[:,:,j:k,j:k]
+        return lasci_sync.LASCI_HessianOperator.ci_response_offdiag (
+            self, kappa1, h1frs_prime)
 
     def orbital_response (self, kappa1, odm1s, ocm2, tdm1rs, tcm2, veff_prime):
         kappa2 = lasci_sync.LASCI_HessianOperator.orbital_response (

From 744ee44bea40860ca5d99f9b40de1be2f43a7a4f Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 25 Jul 2024 12:14:40 -0500
Subject: [PATCH 51/78] minor cleanup

---
 my_pyscf/mcscf/lasscf_async/crunch.py | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index dc9ddea4..1a52bd2b 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -910,10 +910,10 @@ def _update_impurity_hamiltonian_(self, mo_coeff, ci, h2eff_sub=None, e_states=N
             casdm1rs=casdm1rs, casdm2rs=casdm2rs, weights=weights
         )
 
-    def get_grad_orb (las, mo_coeff=None, ci=None, h2eff_sub=None, veff=None, dm1s=None, hermi=-1):
-        gorb = lasci.LASCINoSymm.get_grad_orb (las, mo_coeff=mo_coeff, ci=ci, h2eff_sub=h2eff_sub,
-                                               veff=veff, dm1s=dm1s, hermi=hermi)
-        if mo_coeff is None: mo_coeff = las.mo_coeff
+    def get_grad_orb (las, **kwargs):
+        gorb = lasci.LASCINoSymm.get_grad_orb (las, **kwargs)
+        mo_coeff = kwargs.get ('mo_coeff', self.mo_coeff)
+        hermi = kwargs.get ('hermi', -1)
         nao, nmo = las.mo_coeff.shape
         ncore, ncas = las.ncore, las.ncas
         nocc = ncore + ncas
@@ -933,16 +933,10 @@ def get_grad_orb (las, mo_coeff=None, ci=None, h2eff_sub=None, veff=None, dm1s=N
         else:
             raise ValueError ("kwarg 'hermi' must = -1, 0, or +1")
 
-    def h1e_for_las (las, mo_coeff=None, ncas=None, ncore=None, nelecas=None, ci=None,
-                     ncas_sub=None, nelecas_sub=None, veff=None, h2eff_sub=None, casdm1s_sub=None,
-                     casdm1frs=None):
-        h1e_fr = lasci.LASCINoSymm.h1e_for_las (
-            las, mo_coeff=mo_coeff, ncas=ncas, ncore=ncore, nelecas=nelecas, ci=ci, 
-            ncas_sub=ncas_sub, nelecas_sub=nelecas_sub, veff=veff, h2eff_sub=h2eff_sub,
-            casdm1s_sub=casdm1s_sub, casdm1frs=casdm1frs
-        )
-        if mo_coeff is None: mo_coeff = self.mo_coeff
-        if ncas_sub is None: ncas_sub = self.ncas_sub
+    def h1e_for_las (las, **kwargs):
+        h1e_fr = lasci.LASCINoSymm.h1e_for_las (las, **kwargs)
+        mo_coeff = kwargs.get ('mo_coeff', self.mo_coeff)
+        ncas_sub = kwargs.get ('ncas_sub', self.ncas_sub)
         dh1_rs = np.dot (self.get_hcore_rs () - self.get_hcore ()[None,None,:,:], mo_coeff)
         dh1_rs = np.tensordot (mo_coeff.conj (), dh1_rs, axes=((0),(2))).transpose (1,2,0,3)
         for ix in range (len (ncas_sub)):

From 8de8e6647d776d6a36a9e8394e146f9584178bd0 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 25 Jul 2024 12:41:32 -0500
Subject: [PATCH 52/78] pair_lasci never inherits density fitting

Since you inevitably have to make the ERI array anyway
---
 my_pyscf/mcscf/lasscf_async/crunch.py | 40 ++++++++++++++-------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index 1a52bd2b..d27da03c 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -130,23 +130,25 @@ def _update_impham_1_(self, veff, dm1s, e_tot=None):
         df_eris_mem_error = MemoryError (("Density-fitted two-electron integrals in asynchronous "
                                           "LASSCF (outcore algorithm is not yet supported"))
         if getattr (mf, 'with_df', None) is not None:
+            # TODO: impurity outcore cderi
+            if not self._is_mem_enough (df_naux = mf.with_df.get_naoaux ()):
+                raise df_eris_mem_error
+            _cderi = np.empty ((mf.with_df.get_naoaux (), nimp*(nimp+1)//2),
+                               dtype=imporb_coeff.dtype)
+            ijmosym, mij_pair, moij, ijslice = ao2mo.incore._conc_mos (imporb_coeff, imporb_coeff,
+                                                                        compact=True)
+            b0 = 0
+            for eri1 in mf.with_df.loop ():
+                b1 = b0 + eri1.shape[0]
+                eri2 = _cderi[b0:b1]
+                eri2 = ao2mo._ao2mo.nr_e2 (eri1, moij, ijslice, aosym='s2', mosym=ijmosym,
+                                           out=eri2)
+                b0 = b1
             if getattr (self, 'with_df', None) is not None:
-                # TODO: impurity outcore cderi
-                if not self._is_mem_enough (df_naux = mf.with_df.get_naoaux ()):
-                    raise df_eris_mem_error
-                self.with_df._cderi = np.empty ((mf.with_df.get_naoaux (), nimp*(nimp+1)//2),
-                                                dtype=imporb_coeff.dtype)
-                ijmosym, mij_pair, moij, ijslice = ao2mo.incore._conc_mos (imporb_coeff, imporb_coeff,
-                                                                            compact=True)
-                b0 = 0
-                for eri1 in mf.with_df.loop ():
-                    b1 = b0 + eri1.shape[0]
-                    eri2 = self._cderi[b0:b1]
-                    eri2 = ao2mo._ao2mo.nr_e2 (eri1, moij, ijslice, aosym='s2', mosym=ijmosym,
-                                               out=eri2)
-                    b0 = b1
+                self.with_df._cderi = _cderi
             else:
-                self._eri = mf.with_df.ao2mo (imporb_coeff, compact=True)
+                self._cderi = _cderi
+                self._eri = np.dot (_cderi.conj ().T, _cderi)
         else:
             if getattr (mf, '_eri', None) is None:
                 if not mf._is_mem_enough ():
@@ -574,7 +576,7 @@ def get_vj_ext (self, mo_ext, dm1rs_ext, bmPu=None):
         if bmPu is not None:
             bPuu = np.tensordot (bmPu, mo_ext, axes=((0),(0)))
             rho = np.tensordot (dm1, bPuu, axes=((1,2),(1,2)))
-            bPii = self._scf.with_df._cderi
+            bPii = self._scf._cderi
             vj = lib.unpack_tril (np.tensordot (rho, bPii, axes=((-1),(0))))
         else: # Safety case: AO-basis SCF driver
             imporb_coeff = self.mol.get_imporb_coeff ()
@@ -996,7 +998,7 @@ def get_impurity_casscf (las, ifrag, imporb_builder=None):
     imc.__dict__.update (params.get (ifrag, {}))
     return imc
 
-def get_pair_lasci (las, frags):
+def get_pair_lasci (las, frags, inherit_df=False):
     stdout_dict = stdout = getattr (las, '_flas_stdout', None)
     if stdout is not None: stdout = stdout.get (frags, None)
     output = getattr (las.mol, 'output', None)
@@ -1006,12 +1008,12 @@ def get_pair_lasci (las, frags):
     if stdout is None and stdout_dict is not None:
         stdout_dict[frags] = imol.stdout
     imf = ImpurityHF (imol)
-    if isinstance (las, _DFLASCI):
+    if inherit_df and isinstance (las, _DFLASCI):
         imf = imf.density_fit ()
     ncas_sub = [las.ncas_sub[i] for i in frags]
     nelecas_sub = [las.nelecas_sub[i] for i in frags]
     ilas = ImpurityLASCI (imf, ncas_sub, nelecas_sub)
-    if isinstance (las, _DFLASCI):
+    if inherit_df and isinstance (las, _DFLASCI):
         ilas = lasci.density_fit (ilas, with_df=imf.with_df)
     charges, spins, smults, wfnsyms = lasci.get_space_info (las)
     ilas.state_average_(weights=las.weights, charges=charges[:,frags], spins=spins[:,frags],

From 46e9600f9735e668c95c23e479460c06c7062c6a Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 25 Jul 2024 13:01:52 -0500
Subject: [PATCH 53/78] keyframe.gradient_analysis log function

To facilitate experimenting with the order of operations
---
 my_pyscf/mcscf/lasscf_async/keyframe.py     | 14 ++++++++++++++
 my_pyscf/mcscf/lasscf_async/lasscf_async.py |  1 +
 2 files changed, 15 insertions(+)

diff --git a/my_pyscf/mcscf/lasscf_async/keyframe.py b/my_pyscf/mcscf/lasscf_async/keyframe.py
index d7c96f8e..20332324 100644
--- a/my_pyscf/mcscf/lasscf_async/keyframe.py
+++ b/my_pyscf/mcscf/lasscf_async/keyframe.py
@@ -204,6 +204,20 @@ def _count (lbl, i, j):
 
     return ncommon_core, ncommon_active, ncommon_virt
 
+def gradient_analysis (las, kf, log):
+    ncore, ncas = las.ncore, las.ncas
+    nocc = ncore + ncas
+    gorb = kf.fock1 - kf.fock1.conj ().T
+    gci = las.get_grad_ci (mo_coeff=kf.mo_coeff, ci=kf.ci, h2eff_sub=kf.h2eff_sub, veff=kf.veff)
+    log.debug ('Inactive-virtual |g_orb|: %.15g', linalg.norm (gorb[:ncore,nocc:]))
+    for ifrag, gc in enumerate (gci):
+        i = ncore + sum (las.ncas_sub[:ifrag])
+        j = i + las.ncas_sub[ifrag]
+        log.debug ('Active fragment %d |g_orb|: %.15g ; |g_ci|: %.15g',
+                   ifrag, linalg.norm (gorb[i:j,:]), linalg.norm (gc))
+    return
+
+
 # Function from failed algorithm. May have a future use.
 def get_kappa (las, kf1, kf2):
     '''Decompose unitary matrix of orbital rotations between two keyframes as
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index 44b261dc..20099e3f 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -84,6 +84,7 @@ def kernel (las, mo_coeff=None, ci0=None, conv_tol_grad=1e-4,
         gvec = las.get_grad (ugg=ugg, kf=kf1)
         norm_gvec = linalg.norm (gvec)
         log.info ('LASSCF macro %d : E = %.15g ; |g| = %.15g', it, e_tot, norm_gvec)
+        if verbose > lib.logger.INFO: keyframe.gradient_analysis (las, kf1, log)
         t1 = log.timer ('one LASSCF macro cycle', *t1)
         las.dump_chk (mo_coeff=kf1.mo_coeff, ci=kf1.ci)
         if norm_gvec < conv_tol_grad:

From a26559a4f6abb567b11812d5b3ca9f54fc01281f Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 25 Jul 2024 13:52:44 -0500
Subject: [PATCH 54/78] lasscf_async combine_pair_max_frags member option

The algorithm converges much faster if all active-active relaxations
are optimized simultaneously. However, this may not be scalable in
the long term, because it involves arrays of size Nroots*Nact^2 and
Nact^4. As a compromise, the number of fragments relaxed
simultaneously can be capped (combine_pair_max_frags). It may be
better to select those fragments based on the Hessian rather than
the gradient.
---
 my_pyscf/mcscf/lasscf_async/combine.py      | 48 +++++++++++++++++----
 my_pyscf/mcscf/lasscf_async/lasscf_async.py |  5 ++-
 2 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index cca2e3eb..0262a62d 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -176,7 +176,9 @@ def combine_o0 (las, kf2_list):
     kf1 = relax (las, kf1)
     return kf1
 
-def select_aa_block (las, frags1, frags2, fock1):
+# Relaxing the fragments pairwise slows down optimization way too much in general
+# However, I might be able to get clever w/ memory management...
+def select_aa_block (las, frags1, frags2, fock1, max_frags=None):
     '''Identify from two lists of candidate fragments the single active-active orbital-rotation
     gradient block with the largest norm
 
@@ -186,12 +188,15 @@ def select_aa_block (las, frags1, frags2, fock1):
         frags2 : sequence of integers
         fock1 : ndarray of shape (nmo,nmo)
 
+    Kwargs:
+        max_frags : integer
+
     Returns:
-        i : integer
-            From frags1.
-        j : integer
-            From frags2.
+        aa_frags : set of integers
+            From frags1 and frags2
 '''
+    if max_frags is None: max_frags = getattr (las, 'combine_pair_max_frags', None)
+    if max_frags is None: max_frags = las.nfrags
     frags1 = list (frags1)
     frags2 = list (frags2)
     g_orb = fock1 - fock1.conj ().T
@@ -209,7 +214,33 @@ def select_aa_block (las, frags1, frags2, fock1):
     gmax = np.argmax (gblk)
     i = frags1[gmax // len (frags2)]
     j = frags2[gmax % len (frags2)]
-    return i, j
+    aa_frags = set ((i,j))
+
+    all_frags = sorted (frags1 + frags2)
+    max_frags = min (len (all_frags), max_frags)
+
+    if max_frags < 3: return aa_frags
+
+    all_frags.remove (i)
+    all_frags.remove (j)
+    nextra = max_frags - 2
+    idx = np.zeros (las.ncas, dtype=bool)
+    i0 = sum (las.ncas_sub[:i])
+    i1 = i0 + las.ncas_sub[i]
+    idx[i0:i1] = True
+    j0 = sum (las.ncas_sub[:j])
+    j1 = j0 + las.ncas_sub[j]
+    idx[j0:j1] = True
+    gblk = []
+    for k in all_frags:
+        k0 = sum (las.ncas_sub[:k])
+        k1 = k0 + las.ncas_sub[k]
+        gblk.append (linalg.norm (g_orb[k0:k1,idx]))
+    idx = np.argsort (-np.asarray (gblk))
+    new_frags = set (np.asarray (all_frags)[idx][:nextra])
+    aa_frags = aa_frags.union (new_frags)
+
+    return aa_frags
 
 def combine_pair (las, kf1, kf2, kf_ref=None):
     '''Combine two keyframes and relax one specific block of active-active orbital rotations
@@ -220,9 +251,9 @@ def combine_pair (las, kf1, kf2, kf_ref=None):
                   "({} {})").format (kf1.frags, kf2.frags)
         raise RuntimeError (errstr)
     kf3 = orth_orb (las, [kf1, kf2], kf_ref=kf_ref)
-    i, j = select_aa_block (las, kf1.frags, kf2.frags, kf3.fock1)
+    aa_frags = select_aa_block (las, kf1.frags, kf2.frags, kf3.fock1)
     #kf3 = relax (las, kf3, freeze_inactive=True, unfrozen_frags=(i,j))
-    pair = crunch.get_pair_lasci (las, (i,j))
+    pair = crunch.get_pair_lasci (las, tuple (aa_frags))
     pair._pull_keyframe_(kf3)
     if pair.conv_tol_grad == 'DEFAULT':
         # Default: scale down conv_tol_grad according to size of subproblem
@@ -230,7 +261,6 @@ def combine_pair (las, kf1, kf2, kf_ref=None):
         pair.conv_tol_grad = scale * las.conv_tol_grad
     pair.kernel ()
     kf3 = pair._push_keyframe (kf3)
-    kf3.frags = kf1.frags.union (kf2.frags)
     return kf3
 
 # Function from failed algorithm. Retained for reference
diff --git a/my_pyscf/mcscf/lasscf_async/lasscf_async.py b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
index 20099e3f..befc0857 100644
--- a/my_pyscf/mcscf/lasscf_async/lasscf_async.py
+++ b/my_pyscf/mcscf/lasscf_async/lasscf_async.py
@@ -182,6 +182,8 @@ class LASSCFNoSymm (lasci.LASCINoSymm):
         Key/value pairs are assigned as attributes to the active-active relaxation (``LASCI'')
         subproblem, similar to impurity_params. Use this to, e.g., set a different max_cycle_macro
         for the ``LASCI'' step.
+    combine_pair_max_frags : integer
+        Maximum number of frags to simultaneously relax during the combine_pair step.
     '''
     def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, **kwargs):
         lasci.LASCINoSymm.__init__(self, mf, ncas, nelecas, ncore=ncore, spin_sub=spin_sub,
@@ -192,7 +194,8 @@ def __init__(self, mf, ncas, nelecas, ncore=None, spin_sub=None, **kwargs):
         self.relax_params = {}
         for i, j in itertools.combinations (range (self.nfrags), 2):
             self.relax_params[(i,j)] = {}
-        keys = set (('frags_orbs','impurity_params','relax_params'))
+        self.combine_pair_max_frags = self.nfrags
+        keys = set (('frags_orbs','impurity_params','relax_params','combine_pair_max_frags'))
         self._keys = self._keys.union (keys)
 
     @property

From a5a2de88be49a7f7c8caa9a515b4fcc02c8ec4df Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 25 Jul 2024 14:03:08 -0500
Subject: [PATCH 55/78] TODO comment

---
 my_pyscf/mcscf/lasscf_async/combine.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/my_pyscf/mcscf/lasscf_async/combine.py b/my_pyscf/mcscf/lasscf_async/combine.py
index 0262a62d..0d88c2e2 100644
--- a/my_pyscf/mcscf/lasscf_async/combine.py
+++ b/my_pyscf/mcscf/lasscf_async/combine.py
@@ -221,6 +221,8 @@ def select_aa_block (las, frags1, frags2, fock1, max_frags=None):
 
     if max_frags < 3: return aa_frags
 
+    # TODO: In future, when this becomes relevant, improve the selection:
+    # use Hessian; add fragments one-at-a-time, etc.
     all_frags.remove (i)
     all_frags.remove (j)
     nextra = max_frags - 2

From 3dfbf9a0f2f9ce4c5f70017421ae38a94cba297f Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 29 Jul 2024 14:17:41 -0500
Subject: [PATCH 56/78] PySCF compat check

---
 pyscf_version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyscf_version.txt b/pyscf_version.txt
index 1ff9d86f..e92755fd 100644
--- a/pyscf_version.txt
+++ b/pyscf_version.txt
@@ -1 +1 @@
-git+https://github.com/pyscf/pyscf.git@bf0b1db22556a3c1b4c34426ea8627e636c1b096
+git+https://github.com/pyscf/pyscf.git@ca8c7c1680defdfee2380eda3af3a28d9fb375cb

From 065a8eae6c445b057cea4a9fd478fa892e2fb94b Mon Sep 17 00:00:00 2001
From: Bhavnesh Jangid <bhavnesh@umn.edu>
Date: Tue, 30 Jul 2024 01:00:50 -0500
Subject: [PATCH 57/78] casdms can be read from tempfile

---
 my_pyscf/mcpdft/laspdft.py | 75 +++++++++++++++++++++++++++++++++-----
 1 file changed, 66 insertions(+), 9 deletions(-)

diff --git a/my_pyscf/mcpdft/laspdft.py b/my_pyscf/mcpdft/laspdft.py
index f3566e05..0fa2f646 100644
--- a/my_pyscf/mcpdft/laspdft.py
+++ b/my_pyscf/mcpdft/laspdft.py
@@ -7,6 +7,8 @@
 from copy import deepcopy
 from mrh.my_pyscf.df.sparse_df import sparsedf_array
 from mrh.my_pyscf.lassi import lassi
+import h5py
+import tempfile
 
 try:
     from pyscf.mcpdft.mcpdft import _PDFT, _mcscf_env
@@ -15,6 +17,26 @@
         "pyscf-forge can be found at : https://github.com/pyscf/pyscf-forge"
         raise ImportError(msg)
 
+def make_casdm1s(filename, i):
+    '''
+    This function stores the rdm1s for the given state 'i' in a tempfile
+    '''
+    with h5py.File(filename, 'r') as f:
+        rdm1s_key = f'rdm1s_{i}'
+        rdm1s = f[rdm1s_key][:]
+        rdm1s = np.array(rdm1s)
+    return rdm1s
+
+def make_casdm2s(filename, i):
+    '''
+    This function stores the rdm2s for the given state 'i' in a tempfile
+    '''
+    with h5py.File(filename, 'r') as f:
+        rdm2s_key = f'rdm2s_{i}'
+        rdm2s = f[rdm2s_key][:]
+        rdm2s = np.array(rdm2s)
+    return rdm2s
+
 class _LASPDFT(_PDFT):
     'MC-PDFT energy for a LASSCF wavefunction'
         
@@ -89,7 +111,8 @@ class PDFT(_LASPDFT, mc.__class__):
         _mc_class = mc.__class__
         setattr(_mc_class, 'DoLASSI', None)
         setattr(_mc_class, 'states', None)
-
+        setattr(_mc_class, 'rdmstmpfile', None)
+        
         def get_h2eff(self, mo_coeff=None):
             if self._in_mcscf_env: return mc.__class__.get_h2eff(self, mo_coeff=mo_coeff)
             else: return _LASPDFT.get_h2eff(self, mo_coeff=mo_coeff)
@@ -99,21 +122,53 @@ def compute_pdft_energy_(self, mo_coeff=None, ci=None, ot=None, otxc=None,
             return _LASPDFT.compute_pdft_energy_(self, mo_coeff=mo_coeff, ci=ci, ot=ot, otxc=otxc,
                              grids_level=grids_level, grids_attr=grids_attr, **kwargs)
 
-        if DoLASSI:  _mc_class.DoLASSI = True
+        if DoLASSI:  
+            _mc_class.DoLASSI = True
+            _mc_class.rdmstmpfile = tempfile.NamedTemporaryFile(dir=lib.param.TMPDIR)
+            
         else: _mc_class.DoLASSI = False
         
         if states is not None: _mc_class.states=states
 
         if _mc_class.DoLASSI:
-            # This code doesn't seem efficent, have to calculate the casdm1 and casdm2 in different functions.
+            
+            '''
+            Current RDM function for LASSI is generating the rdm1 and 2 for all the states.
+            The cost of this function is similar to LASSI diagonalization step. Therefore,
+            calling it 2n time for n-states becomes prohibitively expensive. One alternative 
+            can be just call it once and store all the generated casdm1 and casdm2 and later on
+            just call a reader function which will read the rdms from this temp file.
+            I have to make sure to delete or close this tempfile after the calculation, I 
+            will do that later.
+            '''
+            def _store_rdms(self):
+                rdm1s, rdm2s = lassi.roots_make_rdm12s(self, self.ci, self.si)
+                rdmstmpfile = self.rdmstmpfile
+                with h5py.File(rdmstmpfile, 'w') as f:
+                    for i in range(len(self.e_states)):
+                        rdm1s_dname = f'rdm1s_{i}'
+                        f.create_dataset(rdm1s_dname, data=rdm1s[i])
+                        rdm2s_dname = f'rdm2s_{i}'
+                        f.create_dataset(rdm2s_dname, data=rdm2s[i])
+     
+            # # This code doesn't seem efficent, have to calculate the casdm1 and casdm2 in different functions.
+            # def make_one_casdm1s(self, ci=None, state=0, **kwargs):
+                # with lib.temporary_env (self, verbose=2):
+                    # casdm1s = lassi.root_make_rdm12s (self, ci=ci, si=self.si, state=state)[0]
+                # return casdm1s
+            # def make_one_casdm2(self, ci=None, state=0, **kwargs):
+                # with lib.temporary_env (self, verbose=2):
+                    # casdm2s = lassi.root_make_rdm12s (self, ci=ci, si=self.si, state=state)[1]
+                # return casdm2s.sum ((0,3))
+            
             def make_one_casdm1s(self, ci=None, state=0, **kwargs):
-                with lib.temporary_env (self, verbose=2):
-                    casdm1s = lassi.root_make_rdm12s (self, ci=ci, si=self.si, state=state)[0]
-                return casdm1s
+                rdmstmpfile = self.rdmstmpfile
+                return make_casdm1s(rdmstmpfile, state)
+            
             def make_one_casdm2(self, ci=None, state=0, **kwargs):
-                with lib.temporary_env (self, verbose=2):
-                    casdm2s = lassi.root_make_rdm12s (self, ci=ci, si=self.si, state=state)[1]
-                return casdm2s.sum ((0,3))
+                rdmstmpfile = self.rdmstmpfile
+                return make_casdm2s(rdmstmpfile, state).sum ((0,3))
+                
         else:
             make_one_casdm1s=mc.__class__.state_make_casdm1s
             make_one_casdm2=mc.__class__.state_make_casdm2
@@ -125,6 +180,7 @@ def optimize_mcscf_(self, mo_coeff=None, ci0=None, **kwargs):
             Has the same calling signature as the parent kernel method. '''
             with _mcscf_env(self):
                 if self.DoLASSI:
+                    self._store_rdms()
                     self.fcisolver.nroots = len(self.e_states) if self.states is None else self.states
                     self.e_states = self.e_roots
                 else:
@@ -138,3 +194,4 @@ def optimize_mcscf_(self, mo_coeff=None, ci0=None, **kwargs):
     pdft._keys = pdft._keys.union(_keys)
     return pdft
 
+

From a800f32299a61aaaa9d36d9234f20ecc5acef598 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 30 Jul 2024 11:31:51 -0500
Subject: [PATCH 58/78] lassipdft simplify calling and memory management

Simplify the calling signature of mcpdft.LASSI so that ncas_sub and
nelecas_sub no longer have to be passed again; if omitted, they are
taken from the input object. Also implement a memory-management
block loop in the lassipdft RDM constructor function: if there is
enough memory, it still does all the states at once; otherwise, it
splits the states up into blocks.
---
 examples/laspdft/c2h4n4_si_laspdft.py |  4 ++--
 my_pyscf/mcpdft/__init__.py           |  6 +++--
 my_pyscf/mcpdft/laspdft.py            | 32 ++++++++++++++++++---------
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/examples/laspdft/c2h4n4_si_laspdft.py b/examples/laspdft/c2h4n4_si_laspdft.py
index d60dd138..94b44f12 100755
--- a/examples/laspdft/c2h4n4_si_laspdft.py
+++ b/examples/laspdft/c2h4n4_si_laspdft.py
@@ -30,8 +30,8 @@
 lsi.kernel()
 
 # LASSI-PDFT
-mc = mcpdft.LASSI(lsi, 'tPBE', (3, 3), ((2,1),(1,2)), states=[0, 1])
-mc.kernel() 
+mc = mcpdft.LASSI(lsi, 'tPBE', states=[0, 1])
+mc.kernel()
 
 # CASCI-PDFT in las orbitals
 from pyscf import mcpdft
diff --git a/my_pyscf/mcpdft/__init__.py b/my_pyscf/mcpdft/__init__.py
index 600b5038..81a1a10d 100644
--- a/my_pyscf/mcpdft/__init__.py
+++ b/my_pyscf/mcpdft/__init__.py
@@ -107,8 +107,10 @@ def LASSCFPDFT(mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub,  ncore=None, spin_sub
     return _laspdftEnergy(LASSCF,  mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub, ncore=ncore,
                           spin_sub=spin_sub, frozen=frozen, **kwargs)
 
-def LASSIPDFT(mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub, ncore=None, spin_sub=None, frozen=None,
-        states=None, **kwargs):
+def LASSIPDFT(mc_or_mf_or_mol, ot, ncas_sub=None, nelecas_sub=None, ncore=None, spin_sub=None,
+              frozen=None, states=None, **kwargs):
+    if ncas_sub is None: ncas_sub = getattr (mc_or_mf_or_mol, 'ncas_sub', None)
+    if nelecas_sub is None: nelecas_sub = getattr (mc_or_mf_or_mol, 'nelecas_sub', None)
     from mrh.my_pyscf.mcscf.lasscf_o0 import LASSCF
     return _lassipdftEnergy(LASSCF,  mc_or_mf_or_mol, ot, ncas_sub, nelecas_sub, DoLASSI=True, ncore=ncore,
                           spin_sub=spin_sub, frozen=frozen, states=states, **kwargs)
diff --git a/my_pyscf/mcpdft/laspdft.py b/my_pyscf/mcpdft/laspdft.py
index 0fa2f646..95d87caa 100644
--- a/my_pyscf/mcpdft/laspdft.py
+++ b/my_pyscf/mcpdft/laspdft.py
@@ -133,24 +133,34 @@ def compute_pdft_energy_(self, mo_coeff=None, ci=None, ot=None, otxc=None,
         if _mc_class.DoLASSI:
             
             '''
-            Current RDM function for LASSI is generating the rdm1 and 2 for all the states.
-            The cost of this function is similar to LASSI diagonalization step. Therefore,
+            The cost of the RDM build is similar to LASSI diagonalization step. Therefore,
             calling it 2n time for n-states becomes prohibitively expensive. One alternative 
             can be just call it once and store all the generated casdm1 and casdm2 and later on
             just call a reader function which will read the rdms from this temp file.
-            I have to make sure to delete or close this tempfile after the calculation, I 
-            will do that later.
             '''
             def _store_rdms(self):
-                rdm1s, rdm2s = lassi.roots_make_rdm12s(self, self.ci, self.si)
+                # MRH: I made it loop over blocks of states to handle the O(N^5) memory cost
+                # If there's enough memory it'll still do them all at once
+                log = lib.logger.new_logger (self, self.verbose)
+                mem_per_state = (2*(self.ncas**2) + 4*(self.ncas**4)) / 1e6
+                current_mem = lib.current_memory ()[0]
+                if current_mem > self.max_memory:
+                    log.warn ("Current memory usage (%d MB) exceeds maximum memory (%d MB)",
+                              mem_per_state, current_mem)
+                    nblk = 1
+                else:
+                    nblk = int ((self.max_memory - current_mem) / mem_per_state)
                 rdmstmpfile = self.rdmstmpfile
                 with h5py.File(rdmstmpfile, 'w') as f:
-                    for i in range(len(self.e_states)):
-                        rdm1s_dname = f'rdm1s_{i}'
-                        f.create_dataset(rdm1s_dname, data=rdm1s[i])
-                        rdm2s_dname = f'rdm2s_{i}'
-                        f.create_dataset(rdm2s_dname, data=rdm2s[i])
-     
+                    for i in range (0, len (self.e_states), nblk):
+                        rdm1s, rdm2s = lassi.roots_make_rdm12s(self, self.ci, self.si[:,i:i+nblk])
+                        for j in range(i*nblk, min((i+1)*nblk,len(self.e_states))):
+                            rdm1s_dname = f'rdm1s_{j}'
+                            f.create_dataset(rdm1s_dname, data=rdm1s[j])
+                            rdm2s_dname = f'rdm2s_{j}'
+                            f.create_dataset(rdm2s_dname, data=rdm2s[j])
+                        rdm1s = rdm2s = None     
+
             # # This code doesn't seem efficent, have to calculate the casdm1 and casdm2 in different functions.
             # def make_one_casdm1s(self, ci=None, state=0, **kwargs):
                 # with lib.temporary_env (self, verbose=2):

From 76a536925dda7f01e07d45f86c4bb50f2b150117 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 30 Jul 2024 11:38:45 -0500
Subject: [PATCH 59/78] fix indexing laspdft store_rdms

---
 my_pyscf/mcpdft/laspdft.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/my_pyscf/mcpdft/laspdft.py b/my_pyscf/mcpdft/laspdft.py
index 95d87caa..720d752c 100644
--- a/my_pyscf/mcpdft/laspdft.py
+++ b/my_pyscf/mcpdft/laspdft.py
@@ -153,12 +153,13 @@ def _store_rdms(self):
                 rdmstmpfile = self.rdmstmpfile
                 with h5py.File(rdmstmpfile, 'w') as f:
                     for i in range (0, len (self.e_states), nblk):
-                        rdm1s, rdm2s = lassi.roots_make_rdm12s(self, self.ci, self.si[:,i:i+nblk])
-                        for j in range(i*nblk, min((i+1)*nblk,len(self.e_states))):
-                            rdm1s_dname = f'rdm1s_{j}'
-                            f.create_dataset(rdm1s_dname, data=rdm1s[j])
-                            rdm2s_dname = f'rdm2s_{j}'
-                            f.create_dataset(rdm2s_dname, data=rdm2s[j])
+                        j = min (i+nblk, len (self.e_states))
+                        rdm1s, rdm2s = lassi.roots_make_rdm12s(self, self.ci, self.si[:,i:j])
+                        for k in range (i, j):
+                            rdm1s_dname = f'rdm1s_{k}'
+                            f.create_dataset(rdm1s_dname, data=rdm1s[k])
+                            rdm2s_dname = f'rdm2s_{k}'
+                            f.create_dataset(rdm2s_dname, data=rdm2s[k])
                         rdm1s = rdm2s = None     
 
             # # This code doesn't seem efficent, have to calculate the casdm1 and casdm2 in different functions.

From 4d454f232fe2b7ea629958c6aa7c8779710af971 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 30 Jul 2024 11:42:40 -0500
Subject: [PATCH 60/78] use correct rdm build function for specific states

---
 my_pyscf/mcpdft/laspdft.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/my_pyscf/mcpdft/laspdft.py b/my_pyscf/mcpdft/laspdft.py
index 720d752c..5aa956f4 100644
--- a/my_pyscf/mcpdft/laspdft.py
+++ b/my_pyscf/mcpdft/laspdft.py
@@ -154,7 +154,8 @@ def _store_rdms(self):
                 with h5py.File(rdmstmpfile, 'w') as f:
                     for i in range (0, len (self.e_states), nblk):
                         j = min (i+nblk, len (self.e_states))
-                        rdm1s, rdm2s = lassi.roots_make_rdm12s(self, self.ci, self.si[:,i:j])
+                        rdm1s, rdm2s = lassi.root_make_rdm12s(self, self.ci, self.si,
+                                                              state=list(range(i,j)))
                         for k in range (i, j):
                             rdm1s_dname = f'rdm1s_{k}'
                             f.create_dataset(rdm1s_dname, data=rdm1s[k])

From e13907185e122c2b3e31956e5606b2526e769706 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 30 Jul 2024 11:45:40 -0500
Subject: [PATCH 61/78] Fix warning message

---
 my_pyscf/mcpdft/laspdft.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/my_pyscf/mcpdft/laspdft.py b/my_pyscf/mcpdft/laspdft.py
index 5aa956f4..f60831e2 100644
--- a/my_pyscf/mcpdft/laspdft.py
+++ b/my_pyscf/mcpdft/laspdft.py
@@ -146,7 +146,7 @@ def _store_rdms(self):
                 current_mem = lib.current_memory ()[0]
                 if current_mem > self.max_memory:
                     log.warn ("Current memory usage (%d MB) exceeds maximum memory (%d MB)",
-                              mem_per_state, current_mem)
+                              current_mem, self.max_memory)
                     nblk = 1
                 else:
                     nblk = int ((self.max_memory - current_mem) / mem_per_state)

From 6739154b067ca7a343977de9d5d9e650af6a00ca Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 30 Jul 2024 11:52:48 -0500
Subject: [PATCH 62/78] docstring missing line

---
 my_pyscf/lassi/lassi.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/my_pyscf/lassi/lassi.py b/my_pyscf/lassi/lassi.py
index 8c5f964d..8ed9f3a1 100644
--- a/my_pyscf/lassi/lassi.py
+++ b/my_pyscf/lassi/lassi.py
@@ -664,6 +664,9 @@ def root_make_rdm12s (las, ci, si, state=0, orbsym=None, soc=None, break_symmetr
                Linear combination vectors defining LASSI states.
 
         Kwargs:
+            state: integer or sequence of integers
+                Identify the specific LASSI eigenstate(s) for which the density matrices are
+                to be computed.
             orbsym: None or list of orbital symmetries spanning the whole orbital space
             soc: logical
                 Whether to include the effects of spin-orbit coupling (in the 1-RDMs only)

From 5d2eb957cc7e401b2810e2aba22816110c7f641e Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 30 Jul 2024 17:36:14 -0500
Subject: [PATCH 63/78] partial forward-comp pyscf #2342; pyscf-forge #57

Still need to figure out how to deal with laspdft chkfile, after
pyscf-forge #57 is fixed
---
 tests/fci/test_sanmix_casscf.py       | 41 ++++++++++++++-------------
 tests/mcpdft/test_grad_mcpdft_dupe.py |  4 +--
 tests/mcpdft/test_mcpdft_dupe.py      |  4 +--
 3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/tests/fci/test_sanmix_casscf.py b/tests/fci/test_sanmix_casscf.py
index 2b766cc5..6a05dcd9 100644
--- a/tests/fci/test_sanmix_casscf.py
+++ b/tests/fci/test_sanmix_casscf.py
@@ -4,25 +4,28 @@
 from mrh.my_pyscf.fci import csf_solver
 from mrh.my_pyscf.mcscf.addons import state_average_n_mix
 
-mol = gto.M (atom = 'O 0 0 0; H 1.145 0 0', basis='6-31g', symmetry=True, charge=-1, spin=0, verbose=0, output='/dev/null')
-mf = scf.RHF (mol).set (conv_tol=1e-10).run ()
-mc = mcscf.CASSCF (mf, 8, 8).set (conv_tol=1e-10).run ()
-
-anion = csf_solver (mol, smult=1)
-anion.wfnsym = 'A1'
-
-rad1 = csf_solver (mol, smult=2)
-rad1.spin = 1
-rad1.charge = 1
-rad1.wfnsym = 'E1x'
-
-rad2 = csf_solver (mol, smult=2)
-rad2.spin = 1
-rad2.charge = 1
-rad2.wfnsym = 'E1y'
-
-mc = state_average_n_mix (mc, [anion, rad1, rad2], [1.0/3.0,]*3)
-mc.kernel ()
+def setUpModule():
+    global mol, mf, mc, anion, rad1, rad2
+    mol = gto.M (atom = 'O 0 0 0; H 1.145 0 0', basis='6-31g', symmetry=True, charge=-1, spin=0, verbose=0, output='/dev/null')
+    mf = scf.RHF (mol).set (conv_tol=1e-10).run ()
+    mc = mcscf.CASSCF (mf, 8, 8).set (conv_tol=1e-10).run ()
+    mc.ci = None
+ 
+    anion = csf_solver (mol, smult=1)
+    anion.wfnsym = 'A1'
+    
+    rad1 = csf_solver (mol, smult=2)
+    rad1.spin = 1
+    rad1.charge = 1
+    rad1.wfnsym = 'E1x'
+    
+    rad2 = csf_solver (mol, smult=2)
+    rad2.spin = 1
+    rad2.charge = 1
+    rad2.wfnsym = 'E1y'
+    
+    mc = state_average_n_mix (mc, [anion, rad1, rad2], [1.0/3.0,]*3)
+    mc.kernel ()
 
 def tearDownModule():
     global mol, mf, mc, anion, rad1, rad2
diff --git a/tests/mcpdft/test_grad_mcpdft_dupe.py b/tests/mcpdft/test_grad_mcpdft_dupe.py
index f5a481f1..95b741b3 100644
--- a/tests/mcpdft/test_grad_mcpdft_dupe.py
+++ b/tests/mcpdft/test_grad_mcpdft_dupe.py
@@ -40,12 +40,12 @@ def auto_setup (xyz='Li 0 0 0\nH 1.5 0 0'):
     solver_S = fci.solver (mol_nosym, singlet=True).set (spin=0, nroots=2)
     solver_T = fci.solver (mol_nosym, singlet=False).set (spin=2, nroots=3)
     mcp_sa_1 = mcp_ss_nosym.state_average_mix (
-        [solver_S,solver_T], [1.0/5,]*5).run ()
+        [solver_S,solver_T], [1.0/5,]*5).set (ci=None).run ()
     solver_A1 = fci.solver (mol_sym).set (wfnsym='A1', nroots=3)
     solver_E1x = fci.solver (mol_sym).set (wfnsym='E1x', nroots=1, spin=2)
     solver_E1y = fci.solver (mol_sym).set (wfnsym='E1y', nroots=1, spin=2)
     mcp_sa_2 = mcp_ss_sym.state_average_mix (
-        [solver_A1,solver_E1x,solver_E1y], [1.0/5,]*5).run ()
+        [solver_A1,solver_E1x,solver_E1y], [1.0/5,]*5).set (ci=None).run ()
     mcp = [[mcp_ss_nosym, mcp_ss_sym], [mcp_sa_0, mcp_sa_1, mcp_sa_2]]
     nosym = [mol_nosym, mf_nosym, mc_nosym]
     sym = [mol_sym, mf_sym, mc_sym]
diff --git a/tests/mcpdft/test_mcpdft_dupe.py b/tests/mcpdft/test_mcpdft_dupe.py
index da025528..07857aa5 100644
--- a/tests/mcpdft/test_mcpdft_dupe.py
+++ b/tests/mcpdft/test_mcpdft_dupe.py
@@ -37,12 +37,12 @@ def auto_setup (xyz='Li 0 0 0\nH 1.5 0 0', fnal='tPBE'):
     solver_S = fci.solver (mol_nosym, singlet=True).set (spin=0, nroots=2)
     solver_T = fci.solver (mol_nosym, singlet=False).set (spin=2, nroots=3)
     mcp_sa_1 = mcp_ss_nosym.state_average_mix (
-        [solver_S,solver_T], [1.0/5,]*5).run (conv_tol=1e-8)
+        [solver_S,solver_T], [1.0/5,]*5).set (ci=None).run (conv_tol=1e-8)
     solver_A1 = fci.solver (mol_sym).set (wfnsym='A1', nroots=3)
     solver_E1x = fci.solver (mol_sym).set (wfnsym='E1x', nroots=1, spin=2)
     solver_E1y = fci.solver (mol_sym).set (wfnsym='E1y', nroots=1, spin=2)
     mcp_sa_2 = mcp_ss_sym.state_average_mix (
-        [solver_A1,solver_E1x,solver_E1y], [1.0/5,]*5).run (conv_tol=1e-8)
+        [solver_A1,solver_E1x,solver_E1y], [1.0/5,]*5).set (ci=None).run (conv_tol=1e-8)
     mcp = [[mcp_ss_nosym, mcp_ss_sym], [mcp_sa_0, mcp_sa_1, mcp_sa_2]]
     nosym = [mol_nosym, mf_nosym, mc_nosym]
     sym = [mol_sym, mf_sym, mc_sym]

From 771006f6392baa0cb0820e00a7b8649784c0962e Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 31 Jul 2024 15:55:03 -0500
Subject: [PATCH 64/78] LASSI op_o1: one more profiling timer (index lookup)

---
 my_pyscf/lassi/op_o1.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/my_pyscf/lassi/op_o1.py b/my_pyscf/lassi/op_o1.py
index b057ca0e..1d8320b9 100644
--- a/my_pyscf/lassi/op_o1.py
+++ b/my_pyscf/lassi/op_o1.py
@@ -707,6 +707,7 @@ def init_profiling (self):
         self.dt_o, self.dw_o = 0.0, 0.0
         self.dt_u, self.dw_u = 0.0, 0.0
         self.dt_p, self.dw_p = 0.0, 0.0
+        self.dt_i, self.dw_i = 0.0, 0.0
 
     def make_exc_tables (self, hopping_index):
         ''' Generate excitation tables. The nth column of each array is the (n+1)th argument of the
@@ -952,11 +953,14 @@ def _get_addr_range (self, raddr, *inv):
                 Indices of states with different excitation numbers in the fragments in *inv, with
                 all other fragments frozen in the zero state.
         '''
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         addr0, addr1 = self.offs_lroots[raddr]
         inv = list (set (inv))
         lroots = self.lroots[:,raddr:raddr+1]
         envaddr_inv = get_rootaddr_fragaddr (lroots[inv])[1]
         strides_inv = self.strides[raddr][inv]
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_i, self.dw_i = self.dt_i + dt, self.dw_i + dw
         return addr0 + np.dot (strides_inv, envaddr_inv)
 
     def _prepare_spec_addr_ovlp_(self, rbra, rket, *inv):
@@ -1418,6 +1422,7 @@ def sprint_profile (self):
         profile += '\n' + fmt_str.format ('ovlp', self.dt_o, self.dw_o)
         profile += '\n' + fmt_str.format ('umat', self.dt_u, self.dw_u)
         profile += '\n' + fmt_str.format ('put', self.dt_p, self.dw_p)
+        profile += '\n' + fmt_str.format ('idx', self.dt_i, self.dw_i)
         return profile
 
 class HamS2ovlpint (LSTDMint2):

From e0888abf09952062cfcb37640f7d73d460f1ae2c Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 1 Aug 2024 13:06:54 -0500
Subject: [PATCH 65/78] lassi op_o1 profiling & cleanup

Add some memory checking, improve time profiling, and delete unused
lines
---
 my_pyscf/lassi/op_o1.py | 71 ++++++++++++++++++++++++++++++-----------
 1 file changed, 53 insertions(+), 18 deletions(-)

diff --git a/my_pyscf/lassi/op_o1.py b/my_pyscf/lassi/op_o1.py
index 1d8320b9..e5d69287 100644
--- a/my_pyscf/lassi/op_o1.py
+++ b/my_pyscf/lassi/op_o1.py
@@ -649,9 +649,10 @@ class LSTDMint2 (object):
     # TODO: at some point, if it ever becomes rate-limiting, make this multithread better
 
     def __init__(self, ints, nlas, hopping_index, lroots, mask_bra_space=None, mask_ket_space=None,
-                 log=None, dtype=np.float64):
+                 log=None, max_memory=2000, dtype=np.float64):
         self.ints = ints
         self.log = log
+        self.max_memory = max_memory
         self.nlas = nlas
         self.norb = sum (nlas)
         self.lroots = lroots
@@ -937,7 +938,7 @@ def get_ovlp_fac (self, bra, ket, *inv):
         wgt *= fermion_frag_shuffle (self.nelec_rf[ket], uniq_frags)
         return wgt
 
-    def _get_addr_range (self, raddr, *inv):
+    def _get_addr_range (self, raddr, *inv, _profile=True):
         '''Get the integer offsets for successive ENVs in a particular rootspace in which some
         fragments are frozen in the zero state.
 
@@ -960,7 +961,7 @@ def _get_addr_range (self, raddr, *inv):
         envaddr_inv = get_rootaddr_fragaddr (lroots[inv])[1]
         strides_inv = self.strides[raddr][inv]
         dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
-        self.dt_i, self.dw_i = self.dt_i + dt, self.dw_i + dw
+        if _profile: self.dt_i, self.dw_i = self.dt_i + dt, self.dw_i + dw
         return addr0 + np.dot (strides_inv, envaddr_inv)
 
     def _prepare_spec_addr_ovlp_(self, rbra, rket, *inv):
@@ -981,6 +982,7 @@ def _prepare_spec_addr_ovlp_(self, rbra, rket, *inv):
         for rbra1, rket1 in braket_table:
             b, k, o = self._get_spec_addr_ovlp_1space (rbra1, rket1, *inv)
             self._spec_addr_ovlp_cache.append ((rbra1, rket1, b, k, o))
+        current_memory = lib.current_memory ()[0]
         dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
         self.dt_o, self.dw_o = self.dt_o + dt, self.dw_o + dw
         return
@@ -1014,8 +1016,6 @@ def _get_spec_addr_ovlp (self, bra, ket, *inv):
         rbra, rket = self.rootaddr[bra], self.rootaddr[ket]
         braenv = self.envaddr[bra]
         ketenv = self.envaddr[ket]
-        key = tuple ((rbra,rket)) + inv
-        braket_table = self.nonuniq_exc[key]
         bra_rng = []
         ket_rng = []
         facs = []
@@ -1061,8 +1061,8 @@ def _get_spec_addr_ovlp_1space (self, rbra, rket, *inv):
         spec = np.ones (self.nfrags, dtype=bool)
         for i in inv: spec[i] = False
         spec = np.where (spec)[0]
-        bra_rng = self._get_addr_range (rbra, *spec)
-        ket_rng = self._get_addr_range (rket, *spec)
+        bra_rng = self._get_addr_range (rbra, *spec, _profile=False)
+        ket_rng = self._get_addr_range (rket, *spec, _profile=False)
         specints = [self.ints[i] for i in spec]
         o = fac * np.ones ((1,1), dtype=self.dtype)
         for i in specints:
@@ -1443,9 +1443,10 @@ class HamS2ovlpint (LSTDMint2):
     # Hamiltonian in addition to h1 and h2, which are spin-symmetric
 
     def __init__(self, ints, nlas, hopping_index, lroots, h1, h2, mask_bra_space=None,
-                 mask_ket_space=None, log=None, dtype=np.float64):
+                 mask_ket_space=None, log=None, max_memory=2000, dtype=np.float64):
         LSTDMint2.__init__(self, ints, nlas, hopping_index, lroots, mask_bra_space=mask_bra_space,
-                           mask_ket_space=mask_ket_space, log=log, dtype=dtype)
+                           mask_ket_space=mask_ket_space, log=log, max_memory=max_memory,
+                           dtype=dtype)
         if h1.ndim==2: h1 = np.stack ([h1,h1], axis=0)
         self.h1 = h1
         self.h2 = h2
@@ -1548,9 +1549,10 @@ class LRRDMint (LSTDMint2):
     # spinorbital basis
 
     def __init__(self, ints, nlas, hopping_index, lroots, si, mask_bra_space=None,
-                 mask_ket_space=None, log=None, dtype=np.float64):
+                 mask_ket_space=None, log=None, max_memory=2000, dtype=np.float64):
         LSTDMint2.__init__(self, ints, nlas, hopping_index, lroots, mask_bra_space=mask_bra_space,
-                           mask_ket_space=mask_ket_space, log=log, dtype=dtype)
+                           mask_ket_space=mask_ket_space, log=log, max_memory=max_memory,
+                           dtype=dtype)
         self.nroots_si = si.shape[-1]
         self.si = si.copy ()
         self._umat_linequiv_loop_(self.si)
@@ -1605,14 +1607,14 @@ class ContractHamCI (LSTDMint2):
             Contains 2-electron Hamiltonian amplitudes in second quantization
     '''
     def __init__(self, ints, nlas, hopping_index, lroots, h1, h2, nbra=1,
-                 log=None, dtype=np.float64):
+                 log=None, max_memory=2000, dtype=np.float64):
         nfrags, _, nroots, _ = hopping_index.shape
         if nfrags > 2: raise NotImplementedError ("Spectator fragments in _crunch_1c_")
         nket = nroots - nbra
         HamS2ovlpint.__init__(self, ints, nlas, hopping_index, lroots, h1, h2,
                               mask_bra_space = list (range (nket, nroots)),
                               mask_ket_space = list (range (nket)),
-                              log=log, dtype=dtype)
+                              log=log, max_memory=max_memory, dtype=dtype)
         self.nbra = nbra
         self.hci_fr_pabq = self._init_vecs ()
 
@@ -1814,13 +1816,24 @@ def make_stdm12s (las, ci, nelec_frs, **kwargs):
     nlas = las.ncas_sub
     ncas = las.ncas
     nroots = nelec_frs.shape[1]
+    dtype = ci[0][0].dtype
+    max_memory = getattr (las, 'max_memory', las.mol.max_memory)
 
     # First pass: single-fragment intermediates
     hopping_index, ints, lroots = make_ints (las, ci, nelec_frs)
+    nstates = np.sum (np.prod (lroots, axis=0))
+
+    # Memory check
+    current_memory = lib.current_memory ()[0]
+    required_memory = dtype.itemsize*nstates*nstates*(2*(ncas**2)+4*(ncas**4))/1e6
+    if current_memory + required_memory > max_memory:
+        raise MemoryError ("current: {}; required: {}; max: {}".format (
+            current_memory, required_memory, max_memory))
 
     # Second pass: upper-triangle
     t0 = (lib.logger.process_clock (), lib.logger.perf_counter ())
-    outerprod = LSTDMint2 (ints, nlas, hopping_index, lroots, dtype=ci[0][0].dtype, log=log)
+    outerprod = LSTDMint2 (ints, nlas, hopping_index, lroots, dtype=dtype,
+                           max_memory=max_memory, log=log)
     lib.logger.timer (las, 'LAS-state TDM12s second intermediate indexing setup', *t0)        
     tdm1s, tdm2s, t0 = outerprod.kernel ()
     lib.logger.timer (las, 'LAS-state TDM12s second intermediate crunching', *t0)        
@@ -1828,7 +1841,6 @@ def make_stdm12s (las, ci, nelec_frs, **kwargs):
         lib.logger.info (las, 'LAS-state TDM12s crunching profile:\n%s', outerprod.sprint_profile ())
 
     # Put tdm1s in PySCF convention: [p,q] -> q'p
-    nstates = np.sum (np.prod (lroots, axis=0))
     tdm1s = tdm1s.transpose (0,2,4,3,1)
     tdm2s = tdm2s.reshape (nstates,nstates,2,2,ncas,ncas,ncas,ncas).transpose (0,2,4,5,3,6,7,1)
     return tdm1s, tdm2s
@@ -1858,13 +1870,24 @@ def ham (las, h1, h2, ci, nelec_frs, **kwargs):
     '''
     log = lib.logger.new_logger (las, las.verbose)
     nlas = las.ncas_sub
+    max_memory = getattr (las, 'max_memory', las.mol.max_memory)
+    dtype = ci[0][0].dtype
 
     # First pass: single-fragment intermediates
     hopping_index, ints, lroots = make_ints (las, ci, nelec_frs)
+    nstates = np.sum (np.prod (lroots, axis=0))
+
+    # Memory check
+    current_memory = lib.current_memory ()[0]
+    required_memory = dtype.itemsize*nstates*nstates*3/1e6
+    if current_memory + required_memory > max_memory:
+        raise MemoryError ("current: {}; required: {}; max: {}".format (
+            current_memory, required_memory, max_memory))
 
     # Second pass: upper-triangle
     t0 = (lib.logger.process_clock (), lib.logger.perf_counter ())
-    outerprod = HamS2ovlpint (ints, nlas, hopping_index, lroots, h1, h2, dtype=ci[0][0].dtype, log=log)
+    outerprod = HamS2ovlpint (ints, nlas, hopping_index, lroots, h1, h2, dtype=dtype,
+                              max_memory=max_memory, log=log)
     lib.logger.timer (las, 'LASSI Hamiltonian second intermediate indexing setup', *t0)        
     ham, s2, ovlp, t0 = outerprod.kernel ()
     lib.logger.timer (las, 'LASSI Hamiltonian second intermediate crunching', *t0)        
@@ -1896,13 +1919,24 @@ def roots_make_rdm12s (las, ci, nelec_frs, si, **kwargs):
     nlas = las.ncas_sub
     ncas = las.ncas
     nroots_si = si.shape[-1]
+    max_memory = getattr (las, 'max_memory', las.mol.max_memory)
+    dtype = ci[0][0].dtype
 
     # First pass: single-fragment intermediates
     hopping_index, ints, lroots = make_ints (las, ci, nelec_frs)
+    nstates = np.sum (np.prod (lroots, axis=0))
+
+    # Memory check
+    current_memory = lib.current_memory ()[0]
+    required_memory = dtype.itemsize*nroots_si*(2*(ncas**2)+4*(ncas**4))/1e6
+    if current_memory + required_memory > max_memory:
+        raise MemoryError ("current: {}; required: {}; max: {}".format (
+            current_memory, required_memory, max_memory))
 
     # Second pass: upper-triangle
     t0 = (lib.logger.process_clock (), lib.logger.perf_counter ())
-    outerprod = LRRDMint (ints, nlas, hopping_index, lroots, si, dtype=ci[0][0].dtype, log=log)
+    outerprod = LRRDMint (ints, nlas, hopping_index, lroots, si, dtype=dtype,
+                          max_memory=max_memory, log=log)
     lib.logger.timer (las, 'LASSI root RDM12s second intermediate indexing setup', *t0)        
     rdm1s, rdm2s, t0 = outerprod.kernel ()
     lib.logger.timer (las, 'LASSI root RDM12s second intermediate crunching', *t0)
@@ -1964,8 +1998,9 @@ def contract_ham_ci (las, h1, h2, ci_fr_ket, nelec_frs_ket, ci_fr_bra, nelec_frs
 
     # Second pass: upper-triangle
     t0 = (lib.logger.process_clock (), lib.logger.perf_counter ())
+    max_memory = getattr (las, 'max_memory', las.mol.max_memory)
     contracter = ContractHamCI (ints, nlas, hopping_index, lroots, h1, h2, nbra=nbra,
-                                dtype=ci[0][0].dtype, log=log)
+                                dtype=ci[0][0].dtype, max_memory=max_memory, log=log)
     lib.logger.timer (las, 'LASSI Hamiltonian contraction second intermediate indexing setup', *t0)        
     hket_fr_pabq, t0 = contracter.kernel ()
     lib.logger.timer (las, 'LASSI Hamiltonian contraction second intermediate crunching', *t0)

From 54167a8f29fb9fe6ffd2bc5278711fde1a923345 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 1 Aug 2024 13:08:11 -0500
Subject: [PATCH 66/78] delete unused line

---
 my_pyscf/lassi/op_o1.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/my_pyscf/lassi/op_o1.py b/my_pyscf/lassi/op_o1.py
index e5d69287..fb1b527c 100644
--- a/my_pyscf/lassi/op_o1.py
+++ b/my_pyscf/lassi/op_o1.py
@@ -982,7 +982,6 @@ def _prepare_spec_addr_ovlp_(self, rbra, rket, *inv):
         for rbra1, rket1 in braket_table:
             b, k, o = self._get_spec_addr_ovlp_1space (rbra1, rket1, *inv)
             self._spec_addr_ovlp_cache.append ((rbra1, rket1, b, k, o))
-        current_memory = lib.current_memory ()[0]
         dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
         self.dt_o, self.dw_o = self.dt_o + dt, self.dw_o + dw
         return

From 636804d296d4944e7c9ecc620fd6b18d040d5acb Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 1 Aug 2024 13:09:11 -0500
Subject: [PATCH 67/78] better profile

---
 my_pyscf/lassi/op_o1.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/my_pyscf/lassi/op_o1.py b/my_pyscf/lassi/op_o1.py
index fb1b527c..10ac596e 100644
--- a/my_pyscf/lassi/op_o1.py
+++ b/my_pyscf/lassi/op_o1.py
@@ -960,9 +960,10 @@ def _get_addr_range (self, raddr, *inv, _profile=True):
         lroots = self.lroots[:,raddr:raddr+1]
         envaddr_inv = get_rootaddr_fragaddr (lroots[inv])[1]
         strides_inv = self.strides[raddr][inv]
+        addrs = addr0 + np.dot (strides_inv, envaddr_inv)
         dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
         if _profile: self.dt_i, self.dw_i = self.dt_i + dt, self.dw_i + dw
-        return addr0 + np.dot (strides_inv, envaddr_inv)
+        return addrs
 
     def _prepare_spec_addr_ovlp_(self, rbra, rket, *inv):
         '''Prepare the cache for _get_spec_addr_ovlp.

From f45fb0f5a23a7fc4fdeed604048f62c366c8a026 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 1 Aug 2024 15:19:10 -0500
Subject: [PATCH 68/78] lassi time profiling

---
 my_pyscf/lassi/op_o1.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/my_pyscf/lassi/op_o1.py b/my_pyscf/lassi/op_o1.py
index 10ac596e..51d8a715 100644
--- a/my_pyscf/lassi/op_o1.py
+++ b/my_pyscf/lassi/op_o1.py
@@ -709,6 +709,8 @@ def init_profiling (self):
         self.dt_u, self.dw_u = 0.0, 0.0
         self.dt_p, self.dw_p = 0.0, 0.0
         self.dt_i, self.dw_i = 0.0, 0.0
+        self.dt_g, self.dw_g = 0.0, 0.0
+        self.dt_s, self.dw_s = 0.0, 0.0
 
     def make_exc_tables (self, hopping_index):
         ''' Generate excitation tables. The nth column of each array is the (n+1)th argument of the
@@ -1013,6 +1015,7 @@ def _get_spec_addr_ovlp (self, bra, ket, *inv):
         '''
         # NOTE: from tests on triene 3frag LASSI[3,3], this function is 1/4 to 1/6 of the "put"
         # runtime, and apparently it can sometimes multithread somehow???
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         rbra, rket = self.rootaddr[bra], self.rootaddr[ket]
         braenv = self.envaddr[bra]
         ketenv = self.envaddr[ket]
@@ -1028,6 +1031,8 @@ def _get_spec_addr_ovlp (self, bra, ket, *inv):
         bra_rng = np.concatenate (bra_rng)
         ket_rng = np.concatenate (ket_rng)
         facs = np.concatenate (facs)
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_g, self.dw_g = self.dt_g + dt, self.dw_g + dw
         return bra_rng, ket_rng, facs
 
     def _get_spec_addr_ovlp_1space (self, rbra, rket, *inv):
@@ -1094,7 +1099,10 @@ def _put_D1_(self, bra, ket, D1, *inv):
         self.dt_p, self.dw_p = self.dt_p + dt, self.dw_p + dw
 
     def _put_SD1_(self, bra, ket, D1, wgt):
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         self.tdm1s[bra,ket,:] += np.multiply.outer (wgt, D1)
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_s, self.dw_s = self.dt_s + dt, self.dw_s + dw
 
     def _put_D2_(self, bra, ket, D2, *inv):
         t0, w0 = logger.process_clock (), logger.perf_counter ()
@@ -1104,7 +1112,10 @@ def _put_D2_(self, bra, ket, D2, *inv):
         self.dt_p, self.dw_p = self.dt_p + dt, self.dw_p + dw
 
     def _put_SD2_(self, bra, ket, D2, wgt):
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         self.tdm2s[bra,ket,:] += np.multiply.outer (wgt, D2)
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_s, self.dw_s = self.dt_s + dt, self.dw_s + dw
 
     # Cruncher functions
     def _crunch_1d_(self, bra, ket, i):
@@ -1423,6 +1434,9 @@ def sprint_profile (self):
         profile += '\n' + fmt_str.format ('umat', self.dt_u, self.dw_u)
         profile += '\n' + fmt_str.format ('put', self.dt_p, self.dw_p)
         profile += '\n' + fmt_str.format ('idx', self.dt_i, self.dw_i)
+        profile += '\n' + 'Decomposing put:'
+        profile += '\n' + fmt_str.format ('gsao', self.dt_g, self.dw_g)
+        profile += '\n' + fmt_str.format ('putS', self.dt_s, self.dw_s)
         return profile
 
 class HamS2ovlpint (LSTDMint2):
@@ -1464,8 +1478,11 @@ def _put_D1_(self, bra, ket, D1, *inv):
         self.dt_p, self.dw_p = self.dt_p + dt, self.dw_p + dw
 
     def _put_ham_s2_(self, bra, ket, ham, s2, wgt):
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         self.ham[bra,ket] += wgt * ham
         self.s2[bra,ket] += wgt * s2
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_s, self.dw_s = self.dt_s + dt, self.dw_s + dw
 
     def _put_D2_(self, bra, ket, D2, *inv):
         t0, w0 = logger.process_clock (), logger.perf_counter ()
@@ -1556,16 +1573,23 @@ def __init__(self, ints, nlas, hopping_index, lroots, si, mask_bra_space=None,
         self.nroots_si = si.shape[-1]
         self.si = si.copy ()
         self._umat_linequiv_loop_(self.si)
+        self.si = np.asfortranarray (self.si)
 
     def _put_SD1_(self, bra, ket, D1, wgt):
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         si_dm = self.si[bra,:] * self.si[ket,:].conj ()
         fac = np.dot (wgt, si_dm)
         self.rdm1s[:] += np.multiply.outer (fac, D1)
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_s, self.dw_s = self.dt_s + dt, self.dw_s + dw
 
     def _put_SD2_(self, bra, ket, D2, wgt):
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
         si_dm = self.si[bra,:] * self.si[ket,:].conj ()
         fac = np.dot (wgt, si_dm)
         self.rdm2s[:] += np.multiply.outer (fac, D2)
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_s, self.dw_s = self.dt_s + dt, self.dw_s + dw
 
     def _add_transpose_(self):
         self.rdm1s += self.rdm1s.conj ().transpose (0,1,3,2)

From f5d09dbc51eb7f09fc3c0348e6b6e0ad7ac02d09 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 1 Aug 2024 15:19:40 -0500
Subject: [PATCH 69/78] Create liblassi.so

---
 lib/CMakeLists.txt |  9 ++++++++
 lib/lassi/rdm.c    | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 lib/lassi/rdm.c

diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index d1d460b9..26598b55 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -124,4 +124,13 @@ set_target_properties (clib_mrh_fsucc PROPERTIES
     LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}
     OUTPUT_NAME "fsucc")
 
+# Build the LASSI library
+set (LASSI_SOURCE_FILES "lassi/rdm.c")
+add_library (clib_mrh_lassi SHARED ${LASSI_SOURCE_FILES})
+target_link_libraries (clib_mrh_lassi ${LAPACK_LIBRARIES})
+set_target_properties (clib_mrh_lassi PROPERTIES
+    LINKER_LANGUAGE C
+    CLEAN_DIRECT_OUTPUT 1
+    LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}
+    OUTPUT_NAME "lassi")
 
diff --git a/lib/lassi/rdm.c b/lib/lassi/rdm.c
new file mode 100644
index 00000000..721ac3ee
--- /dev/null
+++ b/lib/lassi/rdm.c
@@ -0,0 +1,57 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <omp.h>
+#include <time.h>
+#include "../fblas.h"
+
+/*
+    # A C version of the below would need:
+    #   all args of _put_SD?_ 
+    #   self.si, in some definite order
+    #   length of _put_SD?_ args, ncas, nroots_si, maybe nstates?
+    # If I wanted to index down, I would also need
+    #   ncas_sub, nfrags, inv, len (inv)
+
+    def _put_SD1_(self, bra, ket, D1, wgt):
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
+        si_dm = self.si[bra,:] * self.si[ket,:].conj ()
+        fac = np.dot (wgt, si_dm)
+        self.rdm1s[:] += np.multiply.outer (fac, D1)
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_s, self.dw_s = self.dt_s + dt, self.dw_s + dw
+        
+    def _put_SD2_(self, bra, ket, D2, wgt):
+        t0, w0 = logger.process_clock (), logger.perf_counter ()
+        si_dm = self.si[bra,:] * self.si[ket,:].conj ()
+        fac = np.dot (wgt, si_dm)
+        self.rdm2s[:] += np.multiply.outer (fac, D2)
+        dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
+        self.dt_s, self.dw_s = self.dt_s + dt, self.dw_s + dw
+*/
+
+void LASSIRDMdputSD (double * SDsum, double * SDterm, int SDlen,
+                     double * sivec, int sivec_nbas, int sivec_nroots,
+                     int * bra, int * ket, double * wgt, int nelem)
+{
+    double * sicol = sivec;
+    double * SDtarget = SDsum;
+    double fac = 0;
+    const unsigned int i_one = 1;
+
+    for (int iroot = 0; iroot < sivec_nroots; iroot++){
+        sicol = sivec + (iroot*sivec_nbas);
+        SDtarget = SDsum + (iroot*SDlen);
+        fac = 0;
+        for (int ielem = 0; ielem < nelem; ielem++){
+            fac += sicol[bra[ielem]] * sicol[ket[ielem]] * wgt[ielem];
+        }
+        daxpy_(&SDlen, &fac, SDterm, &i_one, SDtarget, &i_one);
+    }
+
+}
+

From aa060789ec7a3b2c7ef5f664f2f1311b590b1197 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 1 Aug 2024 16:47:52 -0500
Subject: [PATCH 70/78] Offload lassi make_rdm put_SD fn to C

---
 lib/lassi/rdm.c         |  2 +-
 my_pyscf/lassi/op_o1.py | 31 +++++++++++++++++++++++++------
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/lib/lassi/rdm.c b/lib/lassi/rdm.c
index 721ac3ee..4fa48879 100644
--- a/lib/lassi/rdm.c
+++ b/lib/lassi/rdm.c
@@ -36,7 +36,7 @@
 
 void LASSIRDMdputSD (double * SDsum, double * SDterm, int SDlen,
                      double * sivec, int sivec_nbas, int sivec_nroots,
-                     int * bra, int * ket, double * wgt, int nelem)
+                     long * bra, long * ket, double * wgt, int nelem)
 {
     double * sicol = sivec;
     double * SDtarget = SDsum;
diff --git a/my_pyscf/lassi/op_o1.py b/my_pyscf/lassi/op_o1.py
index 51d8a715..0bee5597 100644
--- a/my_pyscf/lassi/op_o1.py
+++ b/my_pyscf/lassi/op_o1.py
@@ -9,6 +9,13 @@
 from mrh.my_pyscf.lassi.citools import get_lroots, get_rootaddr_fragaddr, umat_dot_1frag_
 import time
 
+# C interface
+import ctypes
+from mrh.lib.helper import load_library
+liblassi = load_library ('liblassi')
+def c_arr (arr): return arr.ctypes.data_as(ctypes.c_void_p)
+c_int = ctypes.c_int
+
 # NOTE: PySCF has a strange convention where
 # dm1[p,q] = <q'p>, but
 # dm2[p,q,r,s] = <p'r'sq>
@@ -1577,17 +1584,29 @@ def __init__(self, ints, nlas, hopping_index, lroots, si, mask_bra_space=None,
 
     def _put_SD1_(self, bra, ket, D1, wgt):
         t0, w0 = logger.process_clock (), logger.perf_counter ()
-        si_dm = self.si[bra,:] * self.si[ket,:].conj ()
-        fac = np.dot (wgt, si_dm)
-        self.rdm1s[:] += np.multiply.outer (fac, D1)
+        #si_dm = self.si[bra,:] * self.si[ket,:].conj ()
+        #fac = np.dot (wgt, si_dm)
+        #self.rdm1s[:] += np.multiply.outer (fac, D1)
+        fn = liblassi.LASSIRDMdputSD
+        si_nrow, si_ncol = self.si.shape
+        fn (c_arr(self.rdm1s), c_arr(D1), c_int(D1.size),
+            c_arr(self.si), c_int(si_nrow), c_int(si_ncol),
+            c_arr(bra), c_arr(ket), c_arr (wgt),
+            c_int(len(wgt)))
         dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
         self.dt_s, self.dw_s = self.dt_s + dt, self.dw_s + dw
 
     def _put_SD2_(self, bra, ket, D2, wgt):
         t0, w0 = logger.process_clock (), logger.perf_counter ()
-        si_dm = self.si[bra,:] * self.si[ket,:].conj ()
-        fac = np.dot (wgt, si_dm)
-        self.rdm2s[:] += np.multiply.outer (fac, D2)
+        #si_dm = self.si[bra,:] * self.si[ket,:].conj ()
+        #fac = np.dot (wgt, si_dm)
+        #self.rdm2s[:] += np.multiply.outer (fac, D2)
+        fn = liblassi.LASSIRDMdputSD
+        si_nrow, si_ncol = self.si.shape
+        fn (c_arr(self.rdm2s), c_arr(D2), c_int(D2.size),
+            c_arr(self.si), c_int(si_nrow), c_int(si_ncol),
+            c_arr(bra), c_arr(ket), c_arr (wgt),
+            c_int(len(wgt)))
         dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0
         self.dt_s, self.dw_s = self.dt_s + dt, self.dw_s + dw
 

From 6e7a55a81481220362cfa9d3ff4c829cabb9b9cc Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Thu, 1 Aug 2024 18:01:54 -0500
Subject: [PATCH 71/78] Explicit OMP in lassi/rdm.c (mistake?)

---
 lib/lassi/rdm.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/lib/lassi/rdm.c b/lib/lassi/rdm.c
index 4fa48879..f0886b49 100644
--- a/lib/lassi/rdm.c
+++ b/lib/lassi/rdm.c
@@ -9,6 +9,12 @@
 #include <time.h>
 #include "../fblas.h"
 
+#ifndef MINMAX
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MINMAX
+#endif
+
 /*
     # A C version of the below would need:
     #   all args of _put_SD?_ 
@@ -38,20 +44,33 @@ void LASSIRDMdputSD (double * SDsum, double * SDterm, int SDlen,
                      double * sivec, int sivec_nbas, int sivec_nroots,
                      long * bra, long * ket, double * wgt, int nelem)
 {
+    const unsigned int i_one = 1;
+
+    double fac = 0;
     double * sicol = sivec;
     double * SDtarget = SDsum;
-    double fac = 0;
-    const unsigned int i_one = 1;
 
     for (int iroot = 0; iroot < sivec_nroots; iroot++){
         sicol = sivec + (iroot*sivec_nbas);
         SDtarget = SDsum + (iroot*SDlen);
+
         fac = 0;
+
+        #pragma omp parallel for schedule(static) reduction(+:fac)
         for (int ielem = 0; ielem < nelem; ielem++){
             fac += sicol[bra[ielem]] * sicol[ket[ielem]] * wgt[ielem];
         }
-        daxpy_(&SDlen, &fac, SDterm, &i_one, SDtarget, &i_one);
+
+        //daxpy_(&SDlen, &fac, SDterm, &i_one, SDtarget, &i_one);
+        #pragma omp parallel
+        {
+            int nblk = omp_get_num_threads ();
+            nblk = (SDlen+nblk-1) / nblk;
+            int toff = nblk * omp_get_thread_num ();
+            nblk = MIN (SDlen, toff+nblk);
+            nblk = nblk - toff;
+            daxpy_(&nblk, &fac, SDterm+toff, &i_one, SDtarget+toff, &i_one);
+        }
     }
 
 }
-

From d83e5284925d9ff91ae6131f2b662e73b282be7f Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Fri, 2 Aug 2024 15:55:45 -0500
Subject: [PATCH 72/78] minor optimizations

---
 my_pyscf/lassi/op_o1.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/my_pyscf/lassi/op_o1.py b/my_pyscf/lassi/op_o1.py
index 0bee5597..7ae06525 100644
--- a/my_pyscf/lassi/op_o1.py
+++ b/my_pyscf/lassi/op_o1.py
@@ -1469,8 +1469,8 @@ def __init__(self, ints, nlas, hopping_index, lroots, h1, h2, mask_bra_space=Non
                            mask_ket_space=mask_ket_space, log=log, max_memory=max_memory,
                            dtype=dtype)
         if h1.ndim==2: h1 = np.stack ([h1,h1], axis=0)
-        self.h1 = h1
-        self.h2 = h2
+        self.h1 = np.ascontiguousarray (h1)
+        self.h2 = np.ascontiguousarray (h2)
 
     def _put_D1_(self, bra, ket, D1, *inv):
         t0, w0 = logger.process_clock (), logger.perf_counter ()
@@ -1494,9 +1494,9 @@ def _put_ham_s2_(self, bra, ket, ham, s2, wgt):
     def _put_D2_(self, bra, ket, D2, *inv):
         t0, w0 = logger.process_clock (), logger.perf_counter ()
         ham = np.dot (self.h2.ravel (), D2.sum (0).ravel ()) / 2
-        M2 = np.einsum ('sppqq->s', D2) / 4
+        M2 = D2.diagonal (axis1=1,axis2=2).diagonal (axis1=1,axis2=2).sum ((1,2)) / 4
         s2 = M2[0] + M2[3] - M2[1] - M2[2]
-        s2 -= np.einsum ('pqqp->', D2[1] + D2[2]) / 2
+        s2 -= (D2[1]+D2[2]).diagonal (axis1=0,axis2=3).diagonal (axis1=0,axis2=1).sum () / 2
         bra1, ket1, wgt = self._get_spec_addr_ovlp (bra, ket, *inv)
         self._put_ham_s2_(bra1, ket1, ham, s2, wgt)
         dt, dw = logger.process_clock () - t0, logger.perf_counter () - w0

From 638c21685661eee6f5aca0faabdea355dd8a6919 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 5 Aug 2024 10:21:08 -0500
Subject: [PATCH 73/78] Issue #46 in citools._umat_dot_1frag

---
 my_pyscf/lassi/citools.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/my_pyscf/lassi/citools.py b/my_pyscf/lassi/citools.py
index cb38a2b3..e4b02015 100644
--- a/my_pyscf/lassi/citools.py
+++ b/my_pyscf/lassi/citools.py
@@ -100,12 +100,13 @@ def umat_dot_1frag_(target, umat, lroots, ifrag, iroot, axis=0):
 
 def _umat_dot_1frag (target, umat, lroots, ifrag):
     # Remember: COLUMN-MAJOR ORDER!!
-    old_shape = target.shape
-    new_shape = tuple (lroots[::-1]) + old_shape[1:]
-    target = target.reshape (*new_shape)
     iifrag = len (lroots) - ifrag - 1
-    newaxes = [iifrag,] + list (range (iifrag)) + list (range (iifrag+1, target.ndim))
-    oldaxes = list (np.argsort (newaxes))
-    target = target.transpose (*newaxes)
-    target = np.tensordot (umat.T, target, axes=1).transpose (*oldaxes)
+    old_shape = target.shape
+    new_shape = lroots[::-1]
+    nrow = np.prod (new_shape[:iifrag]).astype (int)
+    ncol = lroots[ifrag]
+    nstack = (np.prod (new_shape[iifrag:]) * np.prod (old_shape[1:])).astype (int) // ncol
+    new_shape = (nrow, ncol, nstack)
+    target = target.reshape (*new_shape).transpose (1,0,2)
+    target = np.tensordot (umat.T, target, axes=1).transpose (1,0,2)
     return target.reshape (*old_shape)

From b01c7e8b5b15c88c846d5d0bcaada173a53f2e2b Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 5 Aug 2024 12:02:35 -0500
Subject: [PATCH 74/78] PySCF(-forge) compat check

---
 pyscf-forge_version.txt | 2 +-
 pyscf_version.txt       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyscf-forge_version.txt b/pyscf-forge_version.txt
index cadad9b7..3085ef69 100644
--- a/pyscf-forge_version.txt
+++ b/pyscf-forge_version.txt
@@ -1 +1 @@
-git+https://github.com/pyscf/pyscf-forge.git@039ba178d9327f96d1ba401fec21d2813c2dca12
+git+https://github.com/pyscf/pyscf-forge.git@1e47da09c9c2a79952915a7ed17e8215c45e42ab
diff --git a/pyscf_version.txt b/pyscf_version.txt
index e92755fd..c126f993 100644
--- a/pyscf_version.txt
+++ b/pyscf_version.txt
@@ -1 +1 @@
-git+https://github.com/pyscf/pyscf.git@ca8c7c1680defdfee2380eda3af3a28d9fb375cb
+git+https://github.com/pyscf/pyscf.git@1f65ec7a6df708aeaf1823e620ae770cdac5f9b6

From 401bca1bdb6b4b605ec4fbc787888b45410e159a Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Mon, 5 Aug 2024 14:15:54 -0500
Subject: [PATCH 75/78] integer overflow safety

---
 my_pyscf/lassi/op_o0.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/my_pyscf/lassi/op_o0.py b/my_pyscf/lassi/op_o0.py
index 65f4cc76..460e44b2 100644
--- a/my_pyscf/lassi/op_o0.py
+++ b/my_pyscf/lassi/op_o0.py
@@ -40,7 +40,7 @@ def memcheck (las, ci, soc=None):
     else:
         nbytes = 2*nbytes_per_sfvec
     # memory load of ci_dp vectors
-    nbytes += sum ([np.prod ([c[iroot].size for c in ci])
+    nbytes += sum ([np.prod ([float (c[iroot].size) for c in ci])
                     * np.amax ([c[iroot].dtype.itemsize for c in ci])
                     for iroot in range (nroots)])
     safety_factor = 1.2

From 0f29cdf8187f2979781db2347293078e786fa656 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 6 Aug 2024 12:36:57 -0500
Subject: [PATCH 76/78] hotfix: I forgot float64 takes 8 bytes

---
 my_pyscf/mcpdft/laspdft.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/my_pyscf/mcpdft/laspdft.py b/my_pyscf/mcpdft/laspdft.py
index f60831e2..8690dca7 100644
--- a/my_pyscf/mcpdft/laspdft.py
+++ b/my_pyscf/mcpdft/laspdft.py
@@ -142,7 +142,7 @@ def _store_rdms(self):
                 # MRH: I made it loop over blocks of states to handle the O(N^5) memory cost
                 # If there's enough memory it'll still do them all at once
                 log = lib.logger.new_logger (self, self.verbose)
-                mem_per_state = (2*(self.ncas**2) + 4*(self.ncas**4)) / 1e6
+                mem_per_state = 8*(2*(self.ncas**2) + 4*(self.ncas**4)) / 1e6
                 current_mem = lib.current_memory ()[0]
                 if current_mem > self.max_memory:
                     log.warn ("Current memory usage (%d MB) exceeds maximum memory (%d MB)",

From 947b98cc58d69d59ca0052deb4edeab12b61edc3 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Tue, 6 Aug 2024 13:08:26 -0500
Subject: [PATCH 77/78] Account for cache in lassi RDM builder

---
 my_pyscf/mcpdft/laspdft.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/my_pyscf/mcpdft/laspdft.py b/my_pyscf/mcpdft/laspdft.py
index 8690dca7..31d6fd87 100644
--- a/my_pyscf/mcpdft/laspdft.py
+++ b/my_pyscf/mcpdft/laspdft.py
@@ -149,7 +149,7 @@ def _store_rdms(self):
                               current_mem, self.max_memory)
                     nblk = 1
                 else:
-                    nblk = int ((self.max_memory - current_mem) / mem_per_state)
+                    nblk = max (1, int ((self.max_memory - current_mem) / mem_per_state)-1)
                 rdmstmpfile = self.rdmstmpfile
                 with h5py.File(rdmstmpfile, 'w') as f:
                     for i in range (0, len (self.e_states), nblk):

From 582b4c2e7ea56f52e42d110037caf88902377360 Mon Sep 17 00:00:00 2001
From: Matthew R Hermes <mrhermes@uchicago.edu>
Date: Wed, 7 Aug 2024 11:45:53 -0500
Subject: [PATCH 78/78] avoid assigning las.stdout to flas_stdout (#111)

---
 my_pyscf/mcscf/lasscf_async/crunch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/my_pyscf/mcscf/lasscf_async/crunch.py b/my_pyscf/mcscf/lasscf_async/crunch.py
index d27da03c..dbd08e4c 100644
--- a/my_pyscf/mcscf/lasscf_async/crunch.py
+++ b/my_pyscf/mcscf/lasscf_async/crunch.py
@@ -1005,7 +1005,7 @@ def get_pair_lasci (las, frags, inherit_df=False):
     if not ((output is None) or (output=='/dev/null')):
         output = output + '.' + '.'.join ([str (s) for s in frags])
     imol = ImpurityMole (las, output=output, stdout=stdout)
-    if stdout is None and stdout_dict is not None:
+    if stdout is None and output is not None and stdout_dict is not None:
         stdout_dict[frags] = imol.stdout
     imf = ImpurityHF (imol)
     if inherit_df and isinstance (las, _DFLASCI):