From b9fb4a8de657c5dd1bfe9aacd5fe7acfce8da3e4 Mon Sep 17 00:00:00 2001 From: Blazej Banaszewski Date: Mon, 5 Aug 2024 10:56:17 +0000 Subject: [PATCH] addressing memory leakage and dataloading speed --- ckpts/minimol_v1/config.yaml | 30 ++++++++++++++---------------- setup.py | 2 +- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/ckpts/minimol_v1/config.yaml b/ckpts/minimol_v1/config.yaml index 82d3f40..023cf75 100644 --- a/ckpts/minimol_v1/config.yaml +++ b/ckpts/minimol_v1/config.yaml @@ -10,13 +10,12 @@ accelerator: datamodule: module_type: "MultitaskFromSmilesDataModule" - # module_type: "FakeDataModule" # Option to use generated data args: # Matches that in the test_multitask_datamodule.py case. task_specific_args: # To be replaced by a new class "DatasetParams" l1000_vcap: df: null - df_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th4.csv - splits_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_VCAP_0-2_th4.csv + splits_path: graphium/data/neurips2023/large-dataset/l1000_vcap_random_splits.pt # df_path: graphium/data/neurips2023/foat_th4_reset_index/LINCS_L1000_VCAP_0-2_th4.csv # splits_path: graphium/data/neurips2023/foat_th4_reset_index/LINCS_L1000_VCAP_0-2_th4_no_admet_test_random_split.pt # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz @@ -29,8 +28,8 @@ datamodule: l1000_mcf7: df: null - df_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th4.csv - splits_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt + df_path: graphium/data/neurips2023/large-dataset/LINCS_L1000_MCF7_0-2_th4.csv + splits_path: graphium/data/neurips2023/large-dataset/l1000_mcf7_random_splits.pt # df_path: graphium/data/neurips2023/foat_th4_reset_index/LINCS_L1000_MCF7_0-2_th4.csv # splits_path: graphium/data/neurips2023/foat_th4_reset_index/LINCS_L1000_MCF7_0-2_th4_no_admet_test_random_split.pt # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz @@ -43,8 +42,8 @@ datamodule: pcba_1328: df: null - df_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet - splits_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt + df_path: graphium/data/neurips2023/large-dataset/PCBA_1328_1564k.parquet + splits_path: graphium/data/neurips2023/large-dataset/pcba_1328_random_splits.pt # df_path: graphium/data/neurips2023/foat_th4_reset_index/PCBA_1328_1564k.parquet # splits_path: graphium/data/neurips2023/foat_th4_reset_index/PCBA_1328_1564k_no_admet_test_random_split.pt # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet @@ -57,8 +56,8 @@ datamodule: pcqm4m_g25: df: null - df_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - splits_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # df_path: graphium/data/neurips2023/foat_th4_reset_index/PCQM4M_G25_N4.parquet # splits_path: graphium/data/neurips2023/foat_th4_reset_index/PCQM4M_G25_N4_no_admet_test_random_split.pt # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet @@ -74,8 +73,8 @@ datamodule: pcqm4m_n4: df: null - df_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet - splits_path: /home/blazejb/graphium/graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt + df_path: graphium/data/neurips2023/large-dataset/PCQM4M_G25_N4.parquet + splits_path: graphium/data/neurips2023/large-dataset/pcqm4m_g25_n4_random_splits.pt # df_path: graphium/data/neurips2023/foat_th4_reset_index/PCQM4M_G25_N4.parquet # splits_path: graphium/data/neurips2023/foat_th4_reset_index/PCQM4M_G25_N4_no_admet_test_random_split.pt # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet @@ -94,8 +93,8 @@ datamodule: batch_size_training: 2048 batch_size_inference: 2048 prepare_dict_or_graph: pyg:graph - featurization_n_jobs: 30 - featurization_progress: True + featurization_n_jobs: -1 + featurization_progress: False featurization_backend: "loky" dataloading_from: ram processed_graph_data_path: ${constants.datacache_path} @@ -130,8 +129,7 @@ datamodule: pos_type: rw_return_probs ksteps: 16 - num_workers: 16 # -1 to use all - # num_workers: -1 # -1 to use all + num_workers: -1 # -1 to use all persistent_workers: False # if use persistent worker at the start of each epoch. # Using persistent_workers false might make the start of each epoch very long. @@ -253,7 +251,7 @@ trainer: architecture: model_type: FullGraphMultiTaskNetwork - mup_base_path: ../ckpts/minimol_v1/base_shape.yaml + mup_base_path: ../minimol/ckpts/minimol_v1/base_shape.yaml mup_load_or_save: load pre_nn: # Set as null to avoid a pre-nn network out_dim: 128 diff --git a/setup.py b/setup.py index 1a3058b..9304082 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='minimol', - version='1.2', + version='1.3.1', packages=find_packages(), include_package_data=True, package_data={