From 30973e8246f1e893f279efa34ee27726d2a94ab5 Mon Sep 17 00:00:00 2001 From: ericzwang-google Date: Tue, 3 Jun 2025 18:02:55 -0400 Subject: [PATCH] Show example of converting regression output back to original label. --- ...TxGemma]Quickstart_with_Hugging_Face.ipynb | 165 ++++++++++++++++-- TxGemma/regression_conversion_parameters.json | 1 + 2 files changed, 156 insertions(+), 10 deletions(-) create mode 100644 TxGemma/regression_conversion_parameters.json diff --git a/TxGemma/[TxGemma]Quickstart_with_Hugging_Face.ipynb b/TxGemma/[TxGemma]Quickstart_with_Hugging_Face.ipynb index 962087ee..99b765bf 100644 --- a/TxGemma/[TxGemma]Quickstart_with_Hugging_Face.ipynb +++ b/TxGemma/[TxGemma]Quickstart_with_Hugging_Face.ipynb @@ -662,6 +662,56 @@ "print(response)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Run a regression task and convert output back to the original label**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/google-gemini/gemma-cookbook/main/TxGemma/regression_conversion_parameters.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def denormalize_str(s: str | float, min_value: str | float, max_value: str | float) -> float:\n", + " convert_func = lambda s: (float(s)/1000.0) * (float(max_value) - float(min_value)) + float(min_value)\n", + " try:\n", + " value = convert_func(s)\n", + " except (TypeError, ValueError): # output is not a number: fall back to the midpoint of the range\n", + " value = convert_func(500.)\n", + " return value\n", + "\n", + "parameters_file = \"regression_conversion_parameters.json\"\n", + "\n", + "if os.path.isfile(parameters_file):\n", + " with open(parameters_file, \"r\") as f:\n", + " reg_params = json.load(f)\n", + "\n", + " task_name = \"Caco2_Wang\"\n", + " prompt = tdc_prompts_json[task_name].replace(input_type, drug_smiles)\n", + " input_ids = tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\")\n", + " outputs = 
model.generate(**input_ids, max_new_tokens=8)\n", + " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " integer = response.split(\"Answer:\")[1].strip()\n", + "\n", + " print(f\"TxGemma Output: {integer}\")\n", + " print(f\"Denormalized output: {denormalize_str(integer, *reg_params['Caco2_Wang'])}\")\n", + "\n", + "else:\n", + " print(f\"{parameters_file} not found for regression example.\")\n" + ] + }, { "cell_type": "markdown", "id": "V4WlXpx2fGN4", @@ -710,7 +760,12 @@ "outputs": [ { "data": { - "text/markdown": "\n\n---\n\n", + "text/markdown": [ + "\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] @@ -720,7 +775,19 @@ }, { "data": { - "text/markdown": "**User:**\n\nInstructions: Answer the following question about drug properties.\nContext: As a membrane separating circulating blood and brain extracellular fluid, the blood-brain barrier (BBB) is the protection layer that blocks most foreign drugs. Thus the ability of a drug to penetrate the barrier to deliver to the site of action forms a crucial challenge in development of drugs for central nervous system.\nQuestion: Given a drug SMILES string, predict whether it\n(A) does not cross the BBB (B) crosses the BBB\nDrug SMILES: CN1C(=O)CN=C(C2=CCCCC2)c2cc(Cl)ccc21\nAnswer:\n\n---\n\n", + "text/markdown": [ + "**User:**\n", + "\n", + "Instructions: Answer the following question about drug properties.\n", + "Context: As a membrane separating circulating blood and brain extracellular fluid, the blood-brain barrier (BBB) is the protection layer that blocks most foreign drugs. 
Thus the ability of a drug to penetrate the barrier to deliver to the site of action forms a crucial challenge in development of drugs for central nervous system.\n", + "Question: Given a drug SMILES string, predict whether it\n", + "(A) does not cross the BBB (B) crosses the BBB\n", + "Drug SMILES: CN1C(=O)CN=C(C2=CCCCC2)c2cc(Cl)ccc21\n", + "Answer:\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] @@ -730,7 +797,14 @@ }, { "data": { - "text/markdown": "**TxGemma:**\n\n(B)\n\n---\n\n", + "text/markdown": [ + "**TxGemma:**\n", + "\n", + "(B)\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] @@ -740,7 +814,14 @@ }, { "data": { - "text/markdown": "**User:**\n\nExplain your reasoning based on the molecule structure.\n\n---\n\n", + "text/markdown": [ + "**User:**\n", + "\n", + "Explain your reasoning based on the molecule structure.\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] @@ -750,7 +831,24 @@ }, { "data": { - "text/markdown": "**TxGemma:**\n\nHere's the breakdown of why the drug with SMILES CN1C(=O)CN=C(C2=CCCCC2)c2cc(Cl)ccc21 likely crosses the BBB:\n\n* **Small Size:** The molecule is relatively small, which is a general favorable characteristic for BBB penetration. Larger molecules have a harder time squeezing through the tight junctions between brain endothelial cells.\n* **Lipophilicity (Hydrophobicity):** The presence of multiple carbon and hydrogen atoms in the benzene rings and alkyl chains makes the molecule predominantly lipophilic (fat-loving). Lipophilicity is a key determinant of BBB permeability. The more lipophilic a drug, the easier it crosses the lipid-rich cell membranes of the BBB.\n* **Lack of Charged Groups:** The molecule lacks large, charged groups (like carboxyl or amine groups). Charged groups tend to be repelled by the lipid bilayer of the BBB, making it harder to cross. 
\n* **Absence of Specific BBB Targets:** While some drugs have specific transporters that help them across the BBB, this molecule doesn't appear to have any obvious targeting groups.\n\n**In summary, the drug's small size, lipophilic nature, lack of significant charge, and absence of specific BBB-targeting groups suggest that it likely possesses a good ability to cross the blood-brain barrier.**\n\n**Important Note:** This is a general prediction based on structural features. Actual BBB permeability is complex and can be influenced by many factors. Experimental validation is always necessary to confirm drug penetration ability. \n\n\n---\n\n", + "text/markdown": [ + "**TxGemma:**\n", + "\n", + "Here's the breakdown of why the drug with SMILES CN1C(=O)CN=C(C2=CCCCC2)c2cc(Cl)ccc21 likely crosses the BBB:\n", + "\n", + "* **Small Size:** The molecule is relatively small, which is a general favorable characteristic for BBB penetration. Larger molecules have a harder time squeezing through the tight junctions between brain endothelial cells.\n", + "* **Lipophilicity (Hydrophobicity):** The presence of multiple carbon and hydrogen atoms in the benzene rings and alkyl chains makes the molecule predominantly lipophilic (fat-loving). Lipophilicity is a key determinant of BBB permeability. The more lipophilic a drug, the easier it crosses the lipid-rich cell membranes of the BBB.\n", + "* **Lack of Charged Groups:** The molecule lacks large, charged groups (like carboxyl or amine groups). Charged groups tend to be repelled by the lipid bilayer of the BBB, making it harder to cross. 
\n", + "* **Absence of Specific BBB Targets:** While some drugs have specific transporters that help them across the BBB, this molecule doesn't appear to have any obvious targeting groups.\n", + "\n", + "**In summary, the drug's small size, lipophilic nature, lack of significant charge, and absence of specific BBB-targeting groups suggest that it likely possesses a good ability to cross the blood-brain barrier.**\n", + "\n", + "**Important Note:** This is a general prediction based on structural features. Actual BBB permeability is complex and can be influenced by many factors. Experimental validation is always necessary to confirm drug penetration ability. \n", + "\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] @@ -805,7 +903,12 @@ "outputs": [ { "data": { - "text/markdown": "\n\n---\n\n", + "text/markdown": [ + "\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] @@ -815,7 +918,19 @@ }, { "data": { - "text/markdown": "**User:**\n\nInstructions: Answer the following question about drug properties.\nContext: As a membrane separating circulating blood and brain extracellular fluid, the blood-brain barrier (BBB) is the protection layer that blocks most foreign drugs. Thus the ability of a drug to penetrate the barrier to deliver to the site of action forms a crucial challenge in development of drugs for central nervous system.\nQuestion: Given a drug SMILES string, predict whether it\n(A) does not cross the BBB (B) crosses the BBB\nDrug SMILES: CN1C(=O)CN=C(C2=CCCCC2)c2cc(Cl)ccc21\nAnswer:\n\n---\n\n", + "text/markdown": [ + "**User:**\n", + "\n", + "Instructions: Answer the following question about drug properties.\n", + "Context: As a membrane separating circulating blood and brain extracellular fluid, the blood-brain barrier (BBB) is the protection layer that blocks most foreign drugs. 
Thus the ability of a drug to penetrate the barrier to deliver to the site of action forms a crucial challenge in development of drugs for central nervous system.\n", + "Question: Given a drug SMILES string, predict whether it\n", + "(A) does not cross the BBB (B) crosses the BBB\n", + "Drug SMILES: CN1C(=O)CN=C(C2=CCCCC2)c2cc(Cl)ccc21\n", + "Answer:\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] @@ -825,7 +940,14 @@ }, { "data": { - "text/markdown": "**TxGemma:**\n\n(B)\n\n---\n\n", + "text/markdown": [ + "**TxGemma:**\n", + "\n", + "(B)\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] @@ -835,7 +957,14 @@ }, { "data": { - "text/markdown": "**User:**\n\nExplain your reasoning based on the molecule structure.\n\n---\n\n", + "text/markdown": [ + "**User:**\n", + "\n", + "Explain your reasoning based on the molecule structure.\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] @@ -845,7 +974,23 @@ }, { "data": { - "text/markdown": "**TxGemma:**\n\nHere's the breakdown of why the drug with SMILES CN1C(=O)CN=C(C2=CCCCC2)c2cc(Cl)ccc21 likely crosses the BBB:\n\n* **Small Size:** The molecule is relatively small, which is a general favorable characteristic for BBB penetration. Larger molecules have a harder time squeezing through the tight junctions between brain endothelial cells.\n* **Lipophilicity (Hydrophobicity):** The presence of multiple carbon and hydrogen atoms in the benzene rings and alkyl chains makes the molecule predominantly lipophilic (fat-loving). Lipophilicity is a key determinant of BBB permeability. The more lipophilic a drug, the easier it crosses the lipid-rich cell membranes of the BBB.\n* **Lack of Charged Groups:** The molecule lacks large, charged groups (like carboxyl or amine groups). Charged groups tend to be repelled by the lipid bilayer of the BBB, making it harder to cross. 
\n* **Absence of Specific BBB Targets:** While some drugs have specific transporters that help them across the BBB, this molecule doesn't appear to have any obvious targeting groups.\n\n**In summary, the drug's small size, lipophilic nature, lack of significant charge, and absence of specific BBB-targeting groups suggest that it likely possesses a good ability to cross the blood-brain barrier.**\n\n**Important Note:** This is a general prediction based on structural features. Actual BBB permeability is complex and can be influenced by many factors. Experimental validation is always necessary to confirm drug penetration ability.\n\n---\n\n", + "text/markdown": [ + "**TxGemma:**\n", + "\n", + "Here's the breakdown of why the drug with SMILES CN1C(=O)CN=C(C2=CCCCC2)c2cc(Cl)ccc21 likely crosses the BBB:\n", + "\n", + "* **Small Size:** The molecule is relatively small, which is a general favorable characteristic for BBB penetration. Larger molecules have a harder time squeezing through the tight junctions between brain endothelial cells.\n", + "* **Lipophilicity (Hydrophobicity):** The presence of multiple carbon and hydrogen atoms in the benzene rings and alkyl chains makes the molecule predominantly lipophilic (fat-loving). Lipophilicity is a key determinant of BBB permeability. The more lipophilic a drug, the easier it crosses the lipid-rich cell membranes of the BBB.\n", + "* **Lack of Charged Groups:** The molecule lacks large, charged groups (like carboxyl or amine groups). Charged groups tend to be repelled by the lipid bilayer of the BBB, making it harder to cross. 
\n", + "* **Absence of Specific BBB Targets:** While some drugs have specific transporters that help them across the BBB, this molecule doesn't appear to have any obvious targeting groups.\n", + "\n", + "**In summary, the drug's small size, lipophilic nature, lack of significant charge, and absence of specific BBB-targeting groups suggest that it likely possesses a good ability to cross the blood-brain barrier.**\n", + "\n", + "**Important Note:** This is a general prediction based on structural features. Actual BBB permeability is complex and can be influenced by many factors. Experimental validation is always necessary to confirm drug penetration ability.\n", + "\n", + "---\n", + "\n" + ], "text/plain": [ "" ] diff --git a/TxGemma/regression_conversion_parameters.json b/TxGemma/regression_conversion_parameters.json new file mode 100644 index 00000000..95c70102 --- /dev/null +++ b/TxGemma/regression_conversion_parameters.json @@ -0,0 +1 @@ +{"Caco2_Wang": ["-7.76", "-3.72948"], "VDss_Lombardo": ["0.01", "700.0"], "Half_Life_Obach": ["0.13", "1200.0"], "Clearance_Hepatocyte_AZ": ["3.0", "150.0"], "Clearance_Microsome_AZ": ["3.0", "150.0"], "LD50_Zhu": ["-0.343", "7.1"], "PPBR_AZ": ["12.63", "99.95"], "BindingDB_kd": ["2.0", "10.0"], "BindingDB_ic50": ["2.0", "10.0"], "BindingDB_ki": ["2.0", "10.0"], "DAVIS": ["5.0", "9.93554"], "USPTO_Yields": ["0.0", "0.9262238423"], "Buchwald_Hartwig": ["0.06", "1.0"], "OncoPolyPharmacology": ["-49.43829775", "71.29880244"], "DrugComb_CSS": ["-9.325965", "80.077045"], "DrugComb_HSA": ["-17.25489767", "15.41516102"], "DrugComb_Loewe": ["-65.01337758", "11.96498908"], "DrugComb_Bliss": ["-16.47171678", "16.74003994"], "DrugComb_ZIP": ["-12.48189979", "15.68920757"], "Protein_SAbDab": ["4.811506144", "9.989323594"], "Lipophilicity_AstraZeneca": ["-1.48", "4.5"], "Solubility_AqSolDB": ["-11.99893786", "1.9675127"], "TAP_CDR_Length": ["39.0", "60.0"], "TAP_PSH": ["83.8389", "173.8525"], "TAP_PPC": ["0.0", "3.1617"], "TAP_PNC": ["0.0", 
"3.4997"], "TAP_SFvCSP": ["-20.4", "36.0"], "Leenay_Fraction_Insertions": ["0.0011676397", "0.8024803216"], "Leenay_Avg_Insertion_Length": ["0.0", "20.7738817"], "Leenay_Avg_Deletion_Length": ["2.110951518", "46.02702703"], "Leenay_Indel_Diversity": ["0.8654522802", "5.548347734"], "Leenay_Fraction_Frameshifts": ["0.001676409904", "0.940090951"], "KIBA": ["7.1", "17.2001795"], "BindingDB_Patent": ["-11.51291547", "16.11809565"]} \ No newline at end of file