From b8724226429e2005ee34e0e35bd9767b0eda0746 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sun, 23 Jul 2023 14:20:21 -0400 Subject: [PATCH] Multi instance | BindingDB from TDC --- data/bindingdb_kd/meta.yaml | 72 +++++++++++++++ data/bindingdb_kd/transform.py | 160 +++++++++++++++++++++++++++++++++ 2 files changed, 232 insertions(+) create mode 100644 data/bindingdb_kd/meta.yaml create mode 100644 data/bindingdb_kd/transform.py diff --git a/data/bindingdb_kd/meta.yaml b/data/bindingdb_kd/meta.yaml new file mode 100644 index 000000000..1166238ab --- /dev/null +++ b/data/bindingdb_kd/meta.yaml @@ -0,0 +1,72 @@ +name: bindingdb_kd +description: |- + BindingDB is a public, web-accessible database of + measured binding affinities, focusing chiefly on the interactions of + protein considered to be drug-targets with small, drug-like molecules. +targets: +- id: BindingDB_Kd + description: binding affinity of the given compound for a given target or protein + units: KD + type: continuous + names: + - noun: The strength of binding of a single molecule to its ligand + - noun: Drug potency for certain protein target + - verb: Inhibit certain protein + - verb: Change the functionality and protein conformation + - adjective: Inhibition of certain protein target + - adjective: Inhibition of certain protein target to change its function + uris: + - http://purl.obolibrary.org/obo/MI_0646 +benchmarks: +- name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: +- id: SMILES + type: SMILES + description: SMILES +- id: Target_ID + type: Other + names: + - noun: protein target id + - noun: protein id + description: protein target id +- id: Target + type: Other + names: + - noun: protein sequence + - noun: protein fastq + description: protein sequence in fastq +license: CC BY 3.0 US. +links: +- url: https://doi.org/10.1093/nar/gkl999 + description: corresponding publication +- url: https://arxiv.org/abs/2004.08919 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/adme/#bbb-blood-brain-barrier-martins-et-al + description: data source +num_points: 47941 +bibtex: +- |- + @article{https://doi.org/10.48550/arxiv.2004.08919, + doi = {10.48550/ARXIV.2004.08919}, + url = {https://arxiv.org/abs/2004.08919}, + author = {Huang, Kexin and Fu, Tianfan and Glass, Lucas and Zitnik, Marinka and Xiao, Cao and Sun, Jimeng}, + keywords = {Machine Learning (cs.LG), Quantitative Methods (q-bio.QM), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Biological sciences, FOS: Biological sciences}, + title = {DeepPurpose: a Deep Learning Library for Drug-Target Interaction Prediction}, + publisher = {arXiv}, + year = {2020}, + copyright = {Creative Commons Attribution 4.0 International} +- |- + @article{Liu2007, + doi = {10.1093/nar/gkl999}, + url = {https://doi.org/10.1093/nar/gkl999}, + year = {2007}, + month = jan, + publisher = {Oxford University Press ({OUP})}, + volume = {35}, + number = {Database}, + pages = {D198--D201}, + author = {T. Liu and Y. Lin and X. Wen and R. N. Jorissen and M. K. Gilson}, + title = {BindingDB: a web-accessible database of experimentally determined protein-ligand binding affinities}, + journal = {Nucleic Acids Research} diff --git a/data/bindingdb_kd/transform.py b/data/bindingdb_kd/transform.py new file mode 100644 index 000000000..8390eb34c --- /dev/null +++ b/data/bindingdb_kd/transform.py @@ -0,0 +1,160 @@ +import pandas as pd +import yaml +from tdc.multi_pred import DTI + +def get_and_transform_data(): + target_subfolder ="BindingDB_Kd" + target_folder = str(target_subfolder).lower() + splits = DTI(name = target_subfolder).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + df = pd.concat([df_train, df_valid, df_test], axis=0) + + fn_data_original = "data_original.csv" + df.to_csv(fn_data_original, index=False) + del df + # create dataframe + df = pd.read_csv( + fn_data_original, + delimiter=",", + ) # not necessary but ensure we can load the saved data + # check if fields are the same + fields_orig = df.columns.tolist() + assert fields_orig == ['Drug_ID', 'Drug', 'Target_ID', 'Target', 'Y', 'split'] + # overwrite column names = fields + fields_clean = ['Drug_ID', 'SMILES', 'Target_ID', 'Target', 'BindingDB_Kd', 'split'] + df.columns = fields_clean + # data cleaning + # remove leading and trailing white space characters + df = df.dropna() + assert not df.duplicated().sum() + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + meta = { + "name": "bindingdb_kd", # unique identifier, we will also use this for directory names + "description": """BindingDB is a public, web-accessible database of + measured binding affinities, focusing chiefly on the interactions of + protein considered to be drug-targets with small, drug-like molecules.""", + "targets": [ + { + "id": "BindingDB_Kd", # name of the column in a tabular dataset + "description": "binding affinity of the given compound for a given target or protein", + "units": "KD", # units of the values in this column (leave empty if unitless) + "type": "continuous", + "names": [ # names for the property (to sample from for building the prompts) + {"noun": "The strength of binding of a single molecule to its ligand"}, + {"noun": "Drug potency for certain protein target"}, + {"verb": "Inhibit certain protein"}, + {"verb": "Change the functionality and protein conformation"}, + {"adjective": "Inhibition of certain protein target"}, + { + "adjective": "Inhibition of certain protein target to change its function" + }, + ], + "uris": [ + "http://purl.obolibrary.org/obo/MI_0646" + + ], + }, + ], + "benchmarks": [ + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, + ], + "identifiers": [ + { + "id": "SMILES", # column name + "type": "SMILES", + "description": "SMILES", # description (optional, except for "Other") + }, + { + "id": "Target_ID", # column name + "type": "Other", + "names": [ + {"noun": "protein target id"}, + {"noun": "protein id"}, + ], + "description": "protein target id", # description (optional, except for "Other") + }, + { + "id": "Target", # column name + "type": "Other", + "names": [ + {"noun": "protein sequence"}, + {"noun": "protein fastq"}, + ], + "description": "protein sequence in fastq", # description (optional, except for "Other") + }, + ], + "license": "CC BY 3.0 US.", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.1093/nar/gkl999", + "description": "corresponding publication", + }, + { + "url": "https://arxiv.org/abs/2004.08919", + "description": "corresponding publication", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/adme/#bbb-blood-brain-barrier-martins-et-al", + "description": "data source", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{https://doi.org/10.48550/arxiv.2004.08919, + doi = {10.48550/ARXIV.2004.08919}, + url = {https://arxiv.org/abs/2004.08919}, + author = {Huang, Kexin and Fu, Tianfan and Glass, Lucas and Zitnik, Marinka and Xiao, Cao and Sun, Jimeng}, + keywords = {Machine Learning (cs.LG), Quantitative Methods (q-bio.QM), Machine Learning (stat.ML), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Biological sciences, FOS: Biological sciences}, + title = {DeepPurpose: a Deep Learning Library for Drug-Target Interaction Prediction}, + publisher = {arXiv}, + year = {2020}, + copyright = {Creative Commons Attribution 4.0 International}""", + """@article{Liu2007, + doi = {10.1093/nar/gkl999}, + url = {https://doi.org/10.1093/nar/gkl999}, + year = {2007}, + month = jan, + publisher = {Oxford University Press ({OUP})}, + volume = {35}, + number = {Database}, + pages = {D198--D201}, + author = {T. Liu and Y. Lin and X. Wen and R. N. Jorissen and M. K. Gilson}, + title = {BindingDB: a web-accessible database of experimentally determined protein-ligand binding affinities}, + journal = {Nucleic Acids Research}""", + ] + } + + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data()