Commit
updating changes from gitlab
Juanma committed Aug 30, 2023
1 parent cf001a7 commit 7cc62d5
Showing 7 changed files with 274 additions and 94 deletions.
54 changes: 37 additions & 17 deletions bin/optimisers/host_guest_overlapping.py
@@ -15,39 +15,59 @@
import tqdm
import pickle
import numpy as np
from datetime import datetime
import tensorflow as tf
from tensorflow.keras import backend as K

from src.utils.optimiser_utils import load_host, load_vae_model, initial_population
from src.utils.optimiser_utils import grad_ed_overlapping

# restrict TensorFlow to the first GPU
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

if __name__ == "__main__":

BATCH_SIZE = 32
DATA_FOLDER = '/home/nvme/juanma/Data/ED/' # in Auchentoshan
host = load_host(DATA_FOLDER+'cc6.pkl', BATCH_SIZE)
# folder where to save the logs of this run
startdate = datetime.now().strftime('%Y-%m-%d')
RUN_FOLDER = startdate

    # just loop until we find a run-folder name that doesn't exist yet
n = 0
while os.path.exists(RUN_FOLDER+'_'+str(n)+'/'):
n += 1
RUN_FOLDER += '_'+str(n)+'/'
os.mkdir(RUN_FOLDER)
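    # e.g. the first run on 2023-08-30 ends up in '2023-08-30_0/'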

BATCH_SIZE = 48
# DATA_FOLDER = '/home/nvme/juanma/Data/ED/' # in Auchentoshan
DATA_FOLDER = '/home/juanma/Data/' # in maddog2020
host = load_host(DATA_FOLDER+'cb6ED.pkl', BATCH_SIZE, expand_dims=True)
vae, z_dim = load_vae_model('logs/vae/2021-05-25/')

# noise_t = initial_population(BATCH_SIZE, random=True)
noise_t = initial_population(BATCH_SIZE, False, datapath=DATA_FOLDER, vae=vae)
# noise_t = initial_population(BATCH_SIZE, False, datapath=DATA_FOLDER, vae=vae)

# initial population
noise_t = K.random_uniform(shape = (BATCH_SIZE, z_dim),
minval = -2.0, maxval = 2.0)
_, _, initial_output = grad_ed_overlapping(noise_t, vae, host)

with open('initial_g.p', 'wb') as file:
with open(RUN_FOLDER+'initial_cb6ONLYED.p', 'wb') as file:
pickle.dump(initial_output, file)

with open('initial_hg.p', 'wb') as file:
pickle.dump(initial_output+host, file)
# we will do five cycles of optimising
for factor in [1, 5, 10, 20, 50]:
lr = 0.05 / factor
slr = str(factor)
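        # each cycle uses a smaller learning rate (0.05/factor) and proportionally fewer steps (10000/factor)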

for i in tqdm.tqdm(range(10000)):
f, grads, output = grad_ed_overlapping(noise_t, vae, host)
print(np.mean(f.numpy()))
noise_t -= 0.05 * grads[0].numpy()
noise_t = np.clip(noise_t, a_min=-4.0, a_max=4.0)
for j in tqdm.tqdm(range(int(10000/factor))):
f, grads, output = grad_ed_overlapping(noise_t, vae, host)
print(np.mean(f.numpy()))
noise_t -= lr * grads[0].numpy()
noise_t = np.clip(noise_t, a_min=-5.0, a_max=5.0)

if i % 1000 == 0:
with open('optimized_g.p', 'wb') as file:
pickle.dump(output, file)
if j % 1000 == 0:
with open(RUN_FOLDER+'optimized_cb6ONLYED'+slr+'.p', 'wb') as file:
pickle.dump(output, file)

with open('optimized_hg.p', 'wb') as file:
pickle.dump(output+host, file)
with open(RUN_FOLDER+'cb6_optimised_final.p', 'wb') as file:
pickle.dump(output, file)
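
For reference, a minimal sketch of reloading a saved run: the run-folder name is hypothetical (one produced by the date+counter logic above), and the pickled output is assumed to be array-like:

import pickle
import numpy as np

with open('2023-08-30_0/cb6_optimised_final.p', 'rb') as f:  # hypothetical run folder
    guests = pickle.load(f)

guests = np.asarray(guests)  # works for numpy arrays and TF tensors alike
print(guests.shape)          # a batch of voxel grids, e.g. (48, 64, 64, 64, 1)
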
140 changes: 74 additions & 66 deletions bin/optimisers/maximise_hg_esp.py
@@ -26,70 +26,78 @@
if __name__ == "__main__":

# factor that we will use to multiply the ED part of gradient descent.
    # The ESP part will be multiplied by (1 - ed_factor)
ed_factor = 0.7

# folder where to save the logs of this run
startdate = datetime.now().strftime('%Y-%m-%d')
RUN_FOLDER = startdate +'_'+ str(ed_factor)

    # just loop until we find a run-folder name that doesn't exist yet
n = 0
while os.path.exists(RUN_FOLDER+'_'+str(n)+'/'):
n += 1
RUN_FOLDER += '_'+str(n)+'/'
os.mkdir(RUN_FOLDER)

BATCH_SIZE = 32
DATA_FOLDER = '/home/nvme/juanma/Data/ED/'
# DATA_FOLDER = '/media/extssd/juanma/'

host_ed, host_esp = load_host_ed_esp(
DATA_FOLDER+'cc6.pkl', DATA_FOLDER+'cc6_esp.pkl', BATCH_SIZE)
vae, z_dim = load_vae_model('logs/vae/2021-05-25/')
ed_to_esp = load_ED_to_ESP('logs/vae_ed_esp/2021-07-18')

noise_t = K.random_uniform(shape = (BATCH_SIZE, z_dim),
minval = -2.0, maxval = 2.0)
_, _, init_eds, init_esps = grad_esp_overlapping(noise_t, vae, ed_to_esp, host_esp)

with open(RUN_FOLDER+'cb6_esp_opt_initial_g.p', 'wb') as file:
pickle.dump(init_eds, file)

with open(RUN_FOLDER+'cb6_esp_opt_initial_g_esp.p', 'wb') as file:
pickle.dump(init_esps, file)

with open(RUN_FOLDER+'cb6_esp_opt_initial_hg.p', 'wb') as file:
pickle.dump(init_eds+host_ed, file)

# we will do five cycles of optimising
for factor in [1, 5, 10, 20, 50]:
lr = 0.05 / factor
slr = str(factor)

for j in tqdm.tqdm(range(int(10000/factor))):
# try to minimise overlapping ESP
f, grads, output, esps = grad_esp_overlapping(noise_t, vae, ed_to_esp, host_esp)
print(np.mean(f.numpy()))
noise_t -= lr * grads[0].numpy() * (1-ed_factor)
noise_t = np.clip(noise_t, a_min=-5.0, a_max=5.0)

if j % 1000 == 0:
with open(RUN_FOLDER+'cb6_esp_optimizedESPED'+slr+'.p', 'wb') as file:
pickle.dump(output, file)
with open(RUN_FOLDER+'cb6_esp_optimizedESP'+slr+'.p', 'wb') as file:
pickle.dump(esps, file)

# try to minimise overlapping ED
f, grads, output = grad_ed_overlapping(noise_t, vae, host_ed)
print(np.mean(f.numpy()))
noise_t -= lr * grads[0].numpy() * ed_factor
noise_t = np.clip(noise_t, a_min=-5.0, a_max=5.0)

if j % 1000 == 0:
with open(RUN_FOLDER+'cb6_esp_optimizedEDED'+slr+'.p', 'wb') as file:
pickle.dump(output, file)

with open(RUN_FOLDER+'cb6_esp_optimized_final.p', 'wb') as file:
pickle.dump(output, file)
for ef in [0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1.0]:

        # The ESP part will be multiplied by (1 - ed_factor)
ed_factor = ef
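        # so ed_factor = 1.0 optimises ED overlap only; 0.0 optimises ESP only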

# folder where to save the logs of this run
startdate = datetime.now().strftime('%Y-%m-%d')
RUN_FOLDER = startdate +'_'+ str(ed_factor)

        # just loop until we find a run-folder name that doesn't exist yet
n = 0
while os.path.exists(RUN_FOLDER+'_'+str(n)+'/'):
n += 1
RUN_FOLDER += '_'+str(n)+'/'
os.mkdir(RUN_FOLDER)

BATCH_SIZE = 48
# DATA_FOLDER = '/home/nvme/juanma/Data/ED/'
DATA_FOLDER = '/home/juanma/Data/' # in maddog2020 and dragonsoop

host_ed, host_esp = load_host_ed_esp(
DATA_FOLDER+'cb6ED.pkl', DATA_FOLDER+'cb6ESP.pkl', BATCH_SIZE,
expand_dims=True, thicken=True)
vae, z_dim = load_vae_model('logs/vae/2021-05-25/')
ed_to_esp = load_ED_to_ESP('logs/vae_ed_esp/2021-07-18')

noise_t = K.random_uniform(shape = (BATCH_SIZE, z_dim),
minval = -2.0, maxval = 2.0)
_, _, init_eds, init_esps = grad_esp_overlapping(noise_t, vae, ed_to_esp, host_esp)

with open(RUN_FOLDER+'cb6_esp_opt_initial_g_ed.p', 'wb') as file:
pickle.dump(init_eds, file)

with open(RUN_FOLDER+'cb6_esp_opt_initial_g_esp.p', 'wb') as file:
pickle.dump(init_esps, file)

with open(RUN_FOLDER+'cb6_esp_opt_initial_hg.p', 'wb') as file:
pickle.dump(init_eds+host_ed, file)

# we will do five cycles of optimising
for factor in [1, 5, 10, 20, 50]:
lr = 0.05 / factor
slr = str(factor)

for j in tqdm.tqdm(range(int(5000/factor))):
# try to minimise overlapping ESP
f, grads, output, esps = grad_esp_overlapping(noise_t, vae, ed_to_esp, host_esp)
print(np.mean(f.numpy()))
noise_t -= lr * grads[0].numpy() * (1-ed_factor)
noise_t = np.clip(noise_t, a_min=-5.0, a_max=5.0)

if j % 1000 == 0:
with open(RUN_FOLDER+'cb6_esp_optimisedESPED'+slr+'.p', 'wb') as file:
pickle.dump(output, file)
with open(RUN_FOLDER+'cb6_esp_optimisedESPESP'+slr+'.p', 'wb') as file:
pickle.dump(esps, file)

# try to minimise overlapping ED
f, grads, output = grad_ed_overlapping(noise_t, vae, host_ed)
print(np.mean(f.numpy()))
noise_t -= lr * grads[0].numpy() * ed_factor
noise_t = np.clip(noise_t, a_min=-5.0, a_max=5.0)

if j % 1000 == 0:
with open(RUN_FOLDER+'cb6_esp_optimisedEDED'+slr+'.p', 'wb') as file:
pickle.dump(output, file)

        # one last forward pass, just to save the final ED and ESP outputs
f, grads, output, esps = grad_esp_overlapping(noise_t, vae, ed_to_esp, host_esp)
with open(RUN_FOLDER+'cb6_esped_optimised_final.p', 'wb') as file:
pickle.dump(output, file)

with open(RUN_FOLDER+'cb6_espesp_optimised_final.p', 'wb') as file:
pickle.dump(esps, file)

2 changes: 1 addition & 1 deletion env.sh
@@ -1,4 +1,4 @@
-CONDA_ENV_NAME=electrondensity
+CONDA_ENV_NAME=ed_tf12
export PROJECT_ROOT=`pwd`
export DATA_DIR='./data'
export LOGS_DIR='./logs'
25 changes: 25 additions & 0 deletions environmentTF12.yml
@@ -0,0 +1,25 @@
name: ed_tf12

channels:
- rdkit
- conda-forge
dependencies:
- cudatoolkit=11.8.0
- python=3.9
- ipython
- tqdm
- rdkit
- xtb=6.4.1
- requests
- argh
- matplotlib
- scipy
- cython
- traits=6.1.1
- traitsui=7.0.0
- mayavi
- pip
- pip:
- nvidia-cudnn-cu11==8.6.0.163
- tensorflow==2.12.*
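
With a standard conda workflow, this environment would be created with conda env create -f environmentTF12.yml; the resulting name ed_tf12 matches the new CONDA_ENV_NAME in env.sh above.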

108 changes: 108 additions & 0 deletions src/utils/generate_ed_from_xyz.py
@@ -0,0 +1,108 @@
##########################################################################################
#
# This script was used to obtain the electron densities and electrostatic potentials
# from single XYZ files. We used it to generate the EDs and ESPs of the hosts.
# To generate EDs for many files (for example the whole QM9 dataset), do not use
# this script; use the scripts we have in the Dataset folder instead.
#
# Author: Jarek & Juanma ([email protected])
#
##########################################################################################

import numpy as np
import shutil
import os
import pickle

from src.datasets.utils.esp import ESP
from src.datasets.utils.xtb import prepare_xtb_input, run_xtb
from src.datasets.utils.orbkit import electron_density_from_molden
import matplotlib.pyplot as plt



def load_xyz(file_path, read_energies=False):
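    # expected (multi-)conformer XYZ layout:
    #   <num_atoms>
    #   <comment line; parsed as the conformer energy when read_energies=True>
    #   <element> <x> <y> <z>   (num_atoms such lines)
    # repeated once per conformer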
with open(file_path, 'r') as file:
lines = file.readlines()
lines = [l.strip('\n') for l in lines]

idx = 0
confs = []
energies = []
while idx < len(lines):
num_atoms = int(lines[idx].strip(' '))
if read_energies:
conf_energy = lines[idx+1].strip(' ')
conf_energy = float(conf_energy)
energies.append(conf_energy)
idx += 2
elements = []
coords = []

for lidx in range(num_atoms):
parsed_line = [l for l in lines[lidx+idx].split(' ') if l!='']
atom = parsed_line[0]
atom_coords = [float(c) for c in parsed_line[1:]]
elements.append(atom)
coords.append(atom_coords)
confs.append([elements, np.array(coords)])
idx += num_atoms
if not read_energies:
return confs
else:
return confs, energies


if __name__ == '__main__':

# these values should be the same as the ones you used to generate the dataset
n_points = 64
step_size = 0.5
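    # i.e. a 64 x 64 x 64 voxel grid with 0.5 spacing between grid points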

# class to calculate the electrostatic potentials
esp = ESP(n_points, step_size)

    # directory where the xtb and orbkit outputs for this host will be written
output_dir = "cb6_output"
if not os.path.exists(output_dir):
os.mkdir(output_dir)

# replace the file here with whatever host you want to convert
elements, coords = load_xyz('cb6v2.xyz')[0]
# xtb_input_file_path = 'input.xtb'
xtb_input_file_path = os.path.join(output_dir, 'input.xtb')

    # reshape coords into the [element, x, y, z] rows that prepare_xtb_input expects
coords = [[e]+list(c) for e, c in zip(elements, coords)]

prepare_xtb_input(coords, xtb_input_file_path)
#xtb_exec_path = "/home/jarek/xtb-6.4.1/bin/xtb"
xtb_exec_path = shutil.which('xtb')
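    # molden=True makes xtb write a molden wavefunction file (read by orbkit below);
    # esp=True makes it write the xtb_esp.dat grid used for the ESP cube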
run_xtb(xtb_exec_path, xtb_input_file_path, output_dir, molden=True, esp=True)

molden_input = os.path.join(output_dir, 'molden.input')
rho = electron_density_from_molden(molden_input, n_points=n_points,
step_size=step_size)

# calculate esp cube from xtb
espxtb_input = os.path.join(output_dir, 'xtb_esp.dat')
molecule_esp = esp.calculate_espcube_from_xtb(espxtb_input)

with open('cb6ED.pkl', 'wb') as file:
pickle.dump(rho, file)

with open('cb6ESP.pkl', 'wb') as file:
pickle.dump(molecule_esp, file)

# plt.imsave('test.png', np.tanh(rho[32]))