From b0d36e49d8c099ef948bbc7f3ddbfe25367fd94e Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 8 Aug 2024 16:57:06 +1000 Subject: [PATCH 1/5] start adding comments --- besmarts-core/python/besmarts/core/graphs.py | 13 +++++++ .../python/examples/07_besmarts_fit.py | 39 +++++++++++++++++-- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/besmarts-core/python/besmarts/core/graphs.py b/besmarts-core/python/besmarts/core/graphs.py index f75c629..51c70ba 100644 --- a/besmarts-core/python/besmarts/core/graphs.py +++ b/besmarts-core/python/besmarts/core/graphs.py @@ -310,7 +310,20 @@ def graph_copy(beg: graph) -> graph: return g def graph_same(g: graph, h: graph) -> bool: + """ + Compare two graphs for equality. + + This function compares the nodes and edges of two graphs for equality. + Nodes and edges must satisfy the following conditions: + + 1. The nodes and edges must have the same keys. + 2. These keys must map to the same atom/edge primitives. + While the node indices must map to the same primitives, + and therefore the order of atoms in each graph must be the same, + the order of the node key itself in the ``graph`` structure can differ. + + """ if set(g.nodes).symmetric_difference(h.nodes): return False if set(g.edges).symmetric_difference(h.edges): diff --git a/besmarts-core/python/examples/07_besmarts_fit.py b/besmarts-core/python/examples/07_besmarts_fit.py index 802d2fb..4070de0 100644 --- a/besmarts-core/python/examples/07_besmarts_fit.py +++ b/besmarts-core/python/examples/07_besmarts_fit.py @@ -8,7 +8,7 @@ import tempfile import os -from typing import Dict, List +from typing import Dict, List, Optional from besmarts.mechanics import fits from besmarts.mechanics import smirnoff_models from besmarts.mechanics import molecular_models as mm @@ -58,7 +58,24 @@ """ -def load_xyz(flist, indices=None) -> assignments.graph_db_row: +def load_xyz(flist: list[str], indices=Optional[list[int]]) -> assignments.graph_db_row: + """ + Convert a list of xyz strings into a graph_db_row. + A graph_db_row is a collection of graph_db_columns, each of which + contains a selection of atoms and their coordinates. + + Parameters + ---------- + flist : list[str] + A list of xyz strings + indices: list[int] + A list of indices to use for the atoms. If None, the indices are + taken from the xyz files. + + Returns + ------- + assignments.graph_db_row + """ s = 0 N = None gdr = assignments.graph_db_row() @@ -83,7 +100,17 @@ def load_xyz(flist, indices=None) -> assignments.graph_db_row: s += 1 return gdr -def make(): +def make() -> dict[str, list[dict[int, str]]]: + """ + Make a dataset for the optimization. + This returns a dictionary with a single molecule. + The key is its SMILES string. The value is a list with one dictionary. + The dictionary keys are essentially + integer enum values for property types, + where the integer value represents the index of the property in a + :class:`besmarts.core.assignments.graph_db_entry`. + """ + global smi global xyz_positions global xyz_grad @@ -100,7 +127,11 @@ def make(): return d -def new_gdb(f: Dict[str, List[str]]) -> assignments.graph_db: +def new_gdb(f: Dict[str, List[Dict[int, str]]]) -> assignments.graph_db: + """ + This creates a ``graph_db`` from a dataset + as generated by ``make()``. + """ gcd = codec_rdkit.graph_codec_rdkit() gdb = assignments.graph_db() From 8f4c6786d8a099cd18507e5c4e1d8b3cd0be5de9 Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 8 Aug 2024 16:57:37 +1000 Subject: [PATCH 2/5] remove redundant additions to graph db? --- besmarts-core/python/examples/07_besmarts_fit.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/besmarts-core/python/examples/07_besmarts_fit.py b/besmarts-core/python/examples/07_besmarts_fit.py index 4070de0..9083ac2 100644 --- a/besmarts-core/python/examples/07_besmarts_fit.py +++ b/besmarts-core/python/examples/07_besmarts_fit.py @@ -140,11 +140,6 @@ def new_gdb(f: Dict[str, List[Dict[int, str]]]) -> assignments.graph_db: g = gcd.smiles_decode(smi) gid = assignments.graph_db_add_graph(gdb, smi, g) - gdb.graphs[gid] = g - gdb.smiles[gid] = smi - gdb.selections[topology.index_of(topology.atom)] = { - gid: {k: v for k, v in enumerate(graphs.graph_atoms(g))} - } gde = assignments.graph_db_entry() gdb.entries[len(gdb.entries)] = gde for rid, rdata in enumerate(fn_dict): From 47dad24c3f580f1b1c865aff3c5d1e3377e9aa2f Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 8 Aug 2024 16:59:56 +1000 Subject: [PATCH 3/5] remove redundant enumerate --- besmarts-core/python/examples/07_besmarts_fit.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/besmarts-core/python/examples/07_besmarts_fit.py b/besmarts-core/python/examples/07_besmarts_fit.py index 9083ac2..95f6d22 100644 --- a/besmarts-core/python/examples/07_besmarts_fit.py +++ b/besmarts-core/python/examples/07_besmarts_fit.py @@ -136,13 +136,14 @@ def new_gdb(f: Dict[str, List[Dict[int, str]]]) -> assignments.graph_db: gdb = assignments.graph_db() for smi, fn_dict in f.items(): + # create a graph and add it to the graph_db + g: graphs.graph = gcd.smiles_decode(smi) + gid: int = assignments.graph_db_add_graph(gdb, smi, g) - g = gcd.smiles_decode(smi) - gid = assignments.graph_db_add_graph(gdb, smi, g) - + # start empty graph_db_entry gde = assignments.graph_db_entry() gdb.entries[len(gdb.entries)] = gde - for rid, rdata in enumerate(fn_dict): + for rdata in fn_dict: tid = assignments.POSITIONS gdt = assignments.graph_db_table(topology.atom) gdg = assignments.graph_db_graph() From 1f9ebe4205acb66613525bcbeebbed43692f04dc Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Fri, 9 Aug 2024 14:46:05 +1000 Subject: [PATCH 4/5] add types and comment gdb creation --- besmarts-core/python/besmarts/core/graphs.py | 8 +- .../python/besmarts/core/topology.py | 16 ++-- .../python/examples/07_besmarts_fit.py | 88 +++++++++++-------- 3 files changed, 67 insertions(+), 45 deletions(-) diff --git a/besmarts-core/python/besmarts/core/graphs.py b/besmarts-core/python/besmarts/core/graphs.py index 51c70ba..584cea2 100644 --- a/besmarts-core/python/besmarts/core/graphs.py +++ b/besmarts-core/python/besmarts/core/graphs.py @@ -45,7 +45,13 @@ def __init__( def __hash__(self) -> int: """ - Return the hash + Return the hash. + The hash is generated using the sorted + nodes and edges of the graph, so the hash should be invariant to + node and edge order. + + After being initially generated, the hash is cached for future use. + Parameters ---------- diff --git a/besmarts-core/python/besmarts/core/topology.py b/besmarts-core/python/besmarts/core/topology.py index ececef2..0163ab4 100644 --- a/besmarts-core/python/besmarts/core/topology.py +++ b/besmarts-core/python/besmarts/core/topology.py @@ -251,14 +251,14 @@ def pair_topology() -> structure_topology: # Singletons -null = null_topology() -atom = atom_topology() -bond = bond_topology() -angle = angle_topology() -torsion = torsion_topology() -outofplane = outofplane_topology() -pair = n_body_topology(2) -triplet = n_body_topology(3) +null: structure_topology = null_topology() +atom: structure_topology = atom_topology() +bond: structure_topology = bond_topology() +angle: structure_topology = angle_topology() +torsion: structure_topology = torsion_topology() +outofplane: structure_topology = outofplane_topology() +pair: structure_topology = n_body_topology(2) +triplet: structure_topology = n_body_topology(3) atom_to_atom = transcode_topology( atom, diff --git a/besmarts-core/python/examples/07_besmarts_fit.py b/besmarts-core/python/examples/07_besmarts_fit.py index 95f6d22..4486f27 100644 --- a/besmarts-core/python/examples/07_besmarts_fit.py +++ b/besmarts-core/python/examples/07_besmarts_fit.py @@ -107,7 +107,7 @@ def make() -> dict[str, list[dict[int, str]]]: The key is its SMILES string. The value is a list with one dictionary. The dictionary keys are essentially integer enum values for property types, - where the integer value represents the index of the property in a + where the integer value represents the table index (tid) of the property in a :class:`besmarts.core.assignments.graph_db_entry`. """ @@ -135,46 +135,62 @@ def new_gdb(f: Dict[str, List[Dict[int, str]]]) -> assignments.graph_db: gcd = codec_rdkit.graph_codec_rdkit() gdb = assignments.graph_db() - for smi, fn_dict in f.items(): + for smi, fn_list_of_dicts in f.items(): # create a graph and add it to the graph_db g: graphs.graph = gcd.smiles_decode(smi) - gid: int = assignments.graph_db_add_graph(gdb, smi, g) + # molecule_graph_id is the graph ID of the graph in the graph_db + molecule_graph_id: int = assignments.graph_db_add_graph(gdb, smi, g) - # start empty graph_db_entry + # start empty graph_db_entry that will contain + # positions, gradients, and energies gde = assignments.graph_db_entry() gdb.entries[len(gdb.entries)] = gde - for rdata in fn_dict: - tid = assignments.POSITIONS - gdt = assignments.graph_db_table(topology.atom) - gdg = assignments.graph_db_graph() - gdt.graphs[gid] = gdg - fn = rdata[tid] - # indices=dict(sorted([(j, x) for j, x in enumerate(g.nodes, 1)], key=lambda x: x[1])) - indices = None - gdr = load_xyz([fn], indices=indices) - gdg.rows[0] = gdr - gde.tables[tid] = gdt - tid = assignments.GRADIENTS - if tid in rdata: - gdt = assignments.graph_db_table(topology.atom) - gdg = assignments.graph_db_graph() - gdt.graphs[gid] = gdg - fn = rdata[tid] - # indices=dict(sorted([(j, x) for j, x in enumerate(g.nodes)], key=lambda x: x[1])) - gdr = load_xyz([fn], indices=indices) - gdg.rows[0] = gdr - gde.tables[tid] = gdt - gx = [x for y in gdr[0].selections.values() for x in y] - gdt.values.extend(gx) - tid = assignments.ENERGY - if tid in rdata: - gdt = assignments.graph_db_table(topology.null) - fn = rdata[tid] - ene = [*map(float, - [x for x in open(fn).read().split('\n') if x] - )] - gdt.values.extend(ene) - gde.tables[tid] = gdt + for rdata_dict in fn_list_of_dicts: + # === first add positions === + positions_index: int = assignments.POSITIONS # the index of the POSITIONS property in a graph_db_entry + # create new empty data table + positions_gdt = assignments.graph_db_table(topology.atom) + # create empty graph_db_graph. This holds data associated with a set of graphs + positions_gdg = assignments.graph_db_graph() + # add the graph_db_graph to the table of positions data + positions_gdt.graphs[molecule_graph_id] = positions_gdg + # read the positions xyz data from the dictionary + positions_gdr: graphs.graph_db_row = load_xyz([rdata_dict[positions_index]]) + # add the positions data to the graph_db_graph + positions_gdg.rows[0] = positions_gdr + # add the table to the graph_db_entry + gde.tables[positions_index] = positions_gdt + + # === check if gradient data is in the dictionary === + gradient_index: int = assignments.GRADIENTS + if gradient_index in rdata_dict: + gradient_gdt = assignments.graph_db_table(topology.atom) + gradient_gdg = assignments.graph_db_graph() + gradient_gdt.graphs[molecule_graph_id] = gradient_gdg + gradient_gdr = load_xyz([rdata_dict[gradient_index]]) + gradient_gdg.rows[0] = gradient_gdr + gde.tables[gradient_index] = gradient_gdt + gradient_graph_db_column = gradient_gdr[0] + flat_values: list[float] = [ + gradient + for gradient_list in gradient_graph_db_column.selections.values() + for gradient in gradient_list + ] + gradient_gdt.values.extend(flat_values) + + # === check if energy data is in the dictionary === + energy_index: int = assignments.ENERGY + if energy_index in rdata_dict: + energy_gdt = assignments.graph_db_table(topology.null) + energy_data_file = rdata_dict[energy_index] + # assume energy data is a file path + with open(energy_data_file, 'r') as f: + energy_data = f.read().split("\n") + # assume energy data is a list of floats with one energy per line + energy_values = [*map(float, [x for x in energy_data if x])] + energy_gdt.values.extend(energy_values) + gde.tables[energy_index] = energy_gdt + return gdb From 9c11575ba1eddfccea05d437eb6214b4705217c1 Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Mon, 12 Aug 2024 14:47:58 +1000 Subject: [PATCH 5/5] add some more comments and signatures --- .../python/besmarts/mechanics/fits.py | 15 +++- .../besmarts/mechanics/molecular_models.py | 25 +++++- .../python/examples/07_besmarts_fit.py | 85 ++++++++++++++----- 3 files changed, 101 insertions(+), 24 deletions(-) diff --git a/besmarts-core/python/besmarts/mechanics/fits.py b/besmarts-core/python/besmarts/mechanics/fits.py index 7e34c84..7f36b0a 100644 --- a/besmarts-core/python/besmarts/mechanics/fits.py +++ b/besmarts-core/python/besmarts/mechanics/fits.py @@ -1725,7 +1725,11 @@ def ff_optimize( initial_objective: objective_tier, tiers: List[objective_tier], final_objective: objective_tier, -) -> mm.chemical_system: +) -> tuple[ + mm.chemical_system, + tuple[float, float], + tuple[float, float] + ]: """ The toplevel function to run a full BESMARTS force field fit. @@ -1740,6 +1744,15 @@ def ff_optimize( tiers: The tiers that will score each parameter candidate final_objective: The objective_tier that will score the remaining candidates passed by the tiers + + Returns + ------- + csys: mm.chemical_system + The final chemical system + physical_objectives: tuple[float, float] + The initial and final physical objectives + chemical_objectives: tuple[float, float] + The initial and final chemical objectives """ started = datetime.datetime.now() diff --git a/besmarts-core/python/besmarts/mechanics/molecular_models.py b/besmarts-core/python/besmarts/mechanics/molecular_models.py index 2114ee2..30fa0e4 100644 --- a/besmarts-core/python/besmarts/mechanics/molecular_models.py +++ b/besmarts-core/python/besmarts/mechanics/molecular_models.py @@ -459,7 +459,30 @@ def __init__(self, models: Dict[str, chemical_model], pcp_model): self.perception: perception.perception_model = pcp_model -def chemical_system_iter_keys(csys): +def chemical_system_iter_keys( + csys: chemical_system +) -> dict[tuple[int, str, int], Any]: + """ + Generate a flat mapping of keys and values of the system terms + in the chemical system. + + Parameters + ---------- + csys: :class:`chemical_system` + + Returns + ------- + dict[tuple[int, str, int], system_term] + The mapping of keys to system terms. + Each key is a tuple of three elements: + the model index (integer), + the term symbol (string), + the value index (integer). + Each value is the actual value of the term. + The type of each value can be found by looking up the + term in csys with + `csys.models[model_index].system_terms[term_symbol].cast`. + """ kv = {} for m, cm in enumerate(csys.models): for t in cm.system_terms: diff --git a/besmarts-core/python/examples/07_besmarts_fit.py b/besmarts-core/python/examples/07_besmarts_fit.py index 4486f27..5832580 100644 --- a/besmarts-core/python/examples/07_besmarts_fit.py +++ b/besmarts-core/python/examples/07_besmarts_fit.py @@ -58,7 +58,7 @@ """ -def load_xyz(flist: list[str], indices=Optional[list[int]]) -> assignments.graph_db_row: +def load_xyz(flist: list[str], indices: Optional[list[int]] = None) -> assignments.graph_db_row: """ Convert a list of xyz strings into a graph_db_row. A graph_db_row is a collection of graph_db_columns, each of which @@ -204,21 +204,53 @@ def run(): """ # 1 Build the dataset and FF - d = make() - csys = load_sage_csys() - gdb = new_gdb(d) - psys = fits.gdb_to_physical_systems(gdb, csys) + example_training_data: dict[str, list[dict]] = make() + + # load the Sage SMIRNOFF force field into a + # ``molecular_mechanics.chemical_system``. + # The ``chemical_system`` has the following models in order: + # bonds, angles, torsions, outofplanes, electrostatics, vdw. + # Each model is an instance of ``molecular_mechanics.chemical_model``. + csys: mm.chemical_system = load_sage_csys() + gdb: assignments.graph_db = new_gdb(example_training_data) + + # set up the physical systems + # a molecular_mechanics.physical_system is a collection of + # ``molecular_mechanics.physical_model`` objects. + # A physical model is the functional form that can be evaluated + # as a function of positions + psys_dict: dict[int, mm.physical_system] = fits.gdb_to_physical_systems( + gdb, csys + ) # 1 Configure the fitting strategy + # models should be a dictionary where the keys are the indices + # of each ``molecular_mechanics.chemical_model`` object + # in the ``csys``. Therefore, + # the 0 index here refers to the bond model. + # if we wanted to train an angle, we would need + # to set the key to 1. models = {0: ["b4"]} - strat = fits.forcefield_optimization_strategy_default(csys, models=models) - co = fits.chemical_objective + + # this sets up the default optimization strategy for the fit. + # It uses the ``models`` dictionary to determine which models + # to fit in the ``csys.models`` list. + # default settings are also applied to splitting, extending, perception + # configs etc. + strategy = fits.forcefield_optimization_strategy_default( + csys, models=models + ) + chemical_objective_function = fits.chemical_objective # 2. Configure the objective tiers final = fits.objective_tier() + # set the objectives for the final tier + # each value is an ``objective_config`` final.objectives = { 0: fits.objective_config_position( assignments.graph_db_address( + # eid stands for entry ID + # here it's set to 0 for the first molecule eid=[0], ), scale=1 @@ -231,48 +263,52 @@ def run(): ), } - fit_models = [0] + fit_models = [0] # 0 here indicates bonds final.fit_models = fit_models - final.fit_symbols = ["l"] + final.fit_symbols = ["l"] # l indicates equilibrium length onestep = fits.objective_tier() + # set same objectives as final tier onestep.objectives = final.objectives + # only do two fitting steps onestep.step_limit = 2 + # the top 3 candidates will be passed to the final tier onestep.accept = 3 + # fit the same models, symbols as the final tier onestep.fit_models = fit_models onestep.fit_symbols = ["l"] tiers = [onestep] initial = final - kv0 = mm.chemical_system_iter_keys(csys) + # get a flat mapping of parameter keys and values + # this gets the original values we can compare to later + original_values: dict = mm.chemical_system_iter_keys(csys) # 4. Optimize newcsys, (P0, P), (C0, C) = fits.ff_optimize( csys, gdb, - psys, - strat, - co, + psys_dict, + strategy, + chemical_objective_function, initial, tiers, final ) - - print("Modified parameters:") - kv = mm.chemical_system_iter_keys(newcsys) - for k, v in kv.items(): - v0 = kv0.get(k) + new_values = mm.chemical_system_iter_keys(newcsys) + for k, v in new_values.items(): + v0 = original_values.get(k) param_line = f"{str(k):20s} | New: {v:12.6g}" - if v0 is not None: + if v0 is not None: # if parameter is present in new FF, print difference dv = v-v0 if abs(dv) < 1e-7: - continue + continue # if parameter hasn't changed, don't print line = param_line + f" Ref {v0:12.6g} Diff {dv:12.6g}" - else: + else: # if parameter has disappeared line = param_line print(line) @@ -288,6 +324,11 @@ def run(): f"Physical {100*(P-P0)/P0:14.2f}%", f"Chemical {100*(C-C0)/C0:14.2f}%" ) + + # write out the final force field + smirnoff_models.smirnoff_write_version_0p3(newcsys, "sage_2.1_refit.offxml") + + xml = """ @@ -665,7 +706,7 @@ def run(): """ -def load_sage_csys(): +def load_sage_csys() -> mm.chemical_system: """ Load the OpenFF Sage 2.1 force field """