From ef028547ff4459f6e98fe429d1564bd1d513fc31 Mon Sep 17 00:00:00 2001
From: Andrei Ivanov <32910461+drivanov@users.noreply.github.com>
Date: Thu, 9 Jan 2025 14:53:31 -0800
Subject: [PATCH] Fixed bug for `writer` initialized by `Chem.SDWriter(...)`. (#9929)

Without the `writer.close()` statement, the file written by `writer` will not be closed properly.
As a result, in our test the end of the file `/workspace/data/MoleculeGPT/raw/molecules.sdf` is missing. This is what it looks like:
```
472184
RDKit 2D
1 0 0 0 0 0 0 0 0 0999 V2000
2.0000 0.0000 0.0000 Os 0 0 0 0 0 15 0 0 0 0 0 0
M CHG 1 1 4
M END
> (4303)
472184
> (4303)
1
> (4303)
0
> (4303)
0
> (4303)
0
> (4303)
0
> (4303)
AAADcQAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==
>
```
The file is truncated mid-record, and the last molecule (#4303) is missing.
As a result, we get a crash later when running the test:
```
Traceback (most recent call last):
  File "/workspace/examples/llm/molecule_gpt.py", line 187, in <module>
    train(
  File "/workspace/examples/llm/molecule_gpt.py", line 69, in train
    dataset = MoleculeGPTDataset(path)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch_geometric/datasets/molecule_gpt_dataset.py", line 217, in __init__
    super().__init__(root, transform, pre_transform, pre_filter,
  File "/usr/local/lib/python3.12/dist-packages/torch_geometric/data/in_memory_dataset.py", line 81, in __init__
    super().__init__(root, transform, pre_transform, pre_filter, log,
  File "/usr/local/lib/python3.12/dist-packages/torch_geometric/data/dataset.py", line 115, in __init__
    self._process()
  File "/usr/local/lib/python3.12/dist-packages/torch_geometric/data/dataset.py", line 262, in _process
    self.process()
  File "/usr/local/lib/python3.12/dist-packages/torch_geometric/datasets/molecule_gpt_dataset.py", line 436, in process
    CAN_SMILES = mol.GetProp("PUBCHEM_OPENEYE_CAN_SMILES")
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyError: 'PUBCHEM_OPENEYE_CAN_SMILES' ``` --- torch_geometric/datasets/molecule_gpt_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch_geometric/datasets/molecule_gpt_dataset.py b/torch_geometric/datasets/molecule_gpt_dataset.py index fed2fe503600..0fe4c9b9d589 100644 --- a/torch_geometric/datasets/molecule_gpt_dataset.py +++ b/torch_geometric/datasets/molecule_gpt_dataset.py @@ -371,6 +371,7 @@ def extract_one_SDF_file(block_id: int) -> None: writer.write(mol) valid_mol_count += 1 + writer.close() print(f"block id: {block_id}\nfound {valid_mol_count}\n\n") sys.stdout.flush() return @@ -410,6 +411,7 @@ def extract_one_SDF_file(block_id: int) -> None: print(f"block id: {block_id} with 0 valid SDF file") continue + writer.close() print(f"In total: {len(found_CID_set)} molecules") # Step 05. Convert to PyG data format