From fb31f3c502c5386669461eee772d7c40ae318e3d Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Tue, 17 Sep 2024 13:07:23 +1000
Subject: [PATCH 01/13] fex env

---
 devtools/conda-envs/falcbot.yaml | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index 0ebe881..de4de99 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -48,6 +48,7 @@ dependencies:
   - alchemiscale-client
   - cinnabar >=0.4.1
   - openeye-toolkits
+  - openfe
 
   # other asapdiscovery deps
   - distributed
@@ -59,11 +60,11 @@ dependencies:
 
   # Pip-only installs
   - pip:
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
 
 

From 20b8ccbb3e3238365417e4d9f6659bb9e5ba9be5 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Wed, 18 Sep 2024 20:31:44 +1000
Subject: [PATCH 02/13] start of llm

---
 devtools/conda-envs/falcbot.yaml |  4 ++
 falcbot/falcbot.py               | 53 +++++++++---------
 falcbot/llm.py                   | 95 ++++++++++++++++++++++++++++++++
 falcbot/util.py                  | 13 +++++
 4 files changed, 139 insertions(+), 26 deletions(-)
 create mode 100644 falcbot/llm.py
 create mode 100644 falcbot/util.py

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index de4de99..c8707fd 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -66,5 +66,9 @@ dependencies:
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
 
+  # llm
+  - langchain
+  - langchain_core
+  - langchain_openai
 

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 471c2f3..f999d19 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -28,6 +28,8 @@
 from asapdiscovery.ml.inference import GATInference, SchnetInference
 from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 from asapdiscovery.ml.models import ASAPMLModelRegistry
+from .llm import _BASIC_ML_LLM
+from .util import _rdkit_smiles_roundtrip, _is_valid_smiles
 
 # from falcbot.sqlite_db import connect_sqlite_db, insert_series, create_series_table
@@ -148,17 +150,6 @@ def _link_to_block_data(link, text):
     }
 
 
-def _is_valid_smiles(smi):
-    m = Chem.MolFromSmiles(smi)
-    if m is None:
-        return False
-    else:
-        return True
-
-
-def _rdkit_smiles_roundtrip(smi: str) -> str:
-    mol = Chem.MolFromSmiles(smi)
-    return Chem.MolToSmiles(mol)
 
 
 def are_you_alive_matcher(event, logger, context):
@@ -508,42 +499,52 @@
-def make_pic50_pred_matcher(event, logger, context):
+def pred_matcher(event, logger, context):
     # regex for any instance of help, case insensitive with optional spaces
     msg = event.get("text", None)
     if not msg:
         return False
-    pattern = r"(?i)predict pIC50 for SMILES"
+    pattern = r"(?i)predict"
     match = re.search(pattern, msg)
     return match
 
 
-@app.event("app_mention", matchers=[make_pic50_pred_matcher])
+@app.event("app_mention", matchers=[pred_matcher])
 def make_pic50_pred(event, say, context, logger):
     content = event.get("text")
-    # parse message for molset using regex
-    pattern = r"(?i)SMILES\s+(.+?)\s+for\s+target\s+(.+)"
-    match = re.search(pattern, content)
-    if match:
-        smiles = match.group(1)
-        target = match.group(2)
-    else:
-        say("Could not find SMILES and Target in the message, unable to proceed")
+    # parse with LLM
+    worked, model = _BASIC_ML_LLM.query(content)
+    if not worked:
+        say("Failed to parse the message, try something like `predict pIC50 for SMILES for target `")
         return
+
+    # get the SMILES, target and property as parsed by the LLM
+    smiles = model.SMILES
+    target = model.biological_target
+    endpoint = model.property # llm found property better
+
     if not _is_valid_smiles(smiles):
         say(f"Invalid SMILES {smiles}, unable to proceed")
         return
+
     if not target in ASAPMLModelRegistry.get_targets_with_models():
         say(
             f"Invalid target {target}, not in: {ASAPMLModelRegistry.get_targets_with_models()}; unable to proceed"
         )
         return
-    # make prediction
+
+    if not endpoint in ASAPMLModelRegistry.get_endpoints():
+        say(
+            f"Invalid endpoint {endpoint}, not in: {ASAPMLModelRegistry.get_endpoints()}; unable to proceed"
+        )
+        return
+
     smiles = _rdkit_smiles_roundtrip(smiles)
-    gs = GATInference.from_latest_by_target(target)
+    gs = GATInference.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
     pred = gs.predict_from_smiles(smiles)
     say(
-        f"Predicted pIC50 for {smiles} is {pred:.2f} using model {gs.model_name} :test_tube:"
+        f"Predicted {endpoint} for {smiles} is {pred:.2f} using model {gs.model_name} :test_tube:"
     )
 
     # TODO make pred for every target if none specified
@@ -662,7 +663,7 @@ def help(say, context, event, logger):
     say(
         "you asked for help or misspelt a command, I can help you with the following commands:"
     )
-    say("* `@falcbot run FEC on series `")
+    # say("* `@falcbot run FEC on series `")
     say("* `@falcbot predict pIC50 for SMILES for target `")
     say("* `@falcbot predict pIC50 for structure for target `")
     say("* `@falcbot list valid targets`")

diff --git a/falcbot/llm.py b/falcbot/llm.py
new file mode 100644
index 0000000..c9ec092
--- /dev/null
+++ b/falcbot/llm.py
@@ -0,0 +1,95 @@
+from langchain_core.prompts import PromptTemplate
+import os
+from langchain_openai import ChatOpenAI
+from asapdiscovery.ml.models import ASAPMLModelRegistry
+from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
+
+from .util import _is_valid_smiles
+
+
+
+from pydantic import BaseModel, Field, validator
+
+class ASAPMLModelQuery(BaseModel):
+    """
+    Model that defines the smiles string, biological target and property of interest
+    """
+    SMILES: str = Field(..., description="SMILES string of the query compound")
+    biological_target: str = Field(..., description="Biological target for the compound")
+    property: str = Field(..., description="Measured property for the compound")
+
+
+    # VALIDATE IN slack function to give feedback?
+
+    # @validator("SMILES")
+    # @classmethod
+    # def validate_smiles(cls, v):
+    #     if not _is_valid_smiles(v):
+    #         raise ValueError("Invalid SMILES string")
+    #     return v
+
+    # @validator("biological_target")
+    # @classmethod
+    # def validate_target(cls, v):
+    #     if v not in TargetTags.get_values():
+    #         raise ValueError("Invalid target")
+    #     return v
+
+    # @validator("property")
+    # @classmethod
+    # def validate_property(cls, v):
+    #     if v not in ASAPMLModelRegistry.get_endpoints():
+    #         raise ValueError("Invalid property")
+    #     return v
+
+
+def _and_join(lst):
+    return " and ".join(lst)
+
+def _make_ml_prompt_template() -> PromptTemplate:
+    """
+    Create a prompt template for the ASAPMLModelQuery model
+    """
+    # join to make a string with "and" between each
+    targets_w_models = ASAPMLModelRegistry.get_targets_with_models()
+    target_str = _and_join(targets_w_models)
+    properties = _and_join(ASAPMLModelRegistry.get_endpoints())
+
+    return PromptTemplate(
+        template="You are an expert scientist, parse the following making sure all SMILES strings are represented exactly as in the input: Be very careful and use only SMILES already in the prompt. Allowed variables for target are {targets} and for property are {properties} : {query}",
+        input_variables=["query"],
+        partial_variables={"properties": properties, "targets": target_str}
+)
+
+_ML_PROMPT_TEMPLATE = _make_ml_prompt_template()
+
+
+
+
+class StructuredLLMQuery:
+
+    def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate, openai_model="gpt-4o",):
+        """
+        """
+        self.openai_model = openai_model
+        self.pydantic_model = pydantic_model
+        self.prompt_template = prompt_template
+        # get openai api key
+        openai_api_key = os.getenv("OPENAI_API_KEY")
+        if openai_api_key is None:
+            raise ValueError("OPENAI_API_KEY environment variable is not set")
+
+        llm = ChatOpenAI(model=self.openai_model)
+
+        structured_llm = llm.with_structured_output(self.pydantic_model)
+        self.chain = prompt_template | structured_llm
+
+    def query(self, query: str):
+        try:
+            parsed_model = self.chain.invoke({'query': query})
+            return True, parsed_model
+        except Exception as e:
+            print(e)
+            return False, None
+
+_BASIC_ML_LLM = StructuredLLMQuery(ASAPMLModelQuery, _ML_PROMPT_TEMPLATE)
\ No newline at end of file

diff --git a/falcbot/util.py b/falcbot/util.py
new file mode 100644
index 0000000..0aedddc
--- /dev/null
+++ b/falcbot/util.py
@@ -0,0 +1,13 @@
+from rdkit import Chem
+
+def _is_valid_smiles(smi):
+    m = Chem.MolFromSmiles(smi)
+    if m is None:
+        return False
+    else:
+        return True
+
+
+def _rdkit_smiles_roundtrip(smi: str) -> str:
+    mol = Chem.MolFromSmiles(smi)
+    return Chem.MolToSmiles(mol)

From 63b007917e9a7beefde88f4791266ef44a4dea14 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Wed, 18 Sep 2024 21:01:50 +1000
Subject: [PATCH 03/13] current_state

---
 devtools/conda-envs/falcbot.yaml |  7 +++++--
 falcbot/falcbot.py               | 12 ++++++------
 falcbot/llm.py                   |  2 +-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index c8707fd..9f0410b 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -20,6 +20,11 @@ dependencies:
   - pydantic<2.0a0
   - biopython
 
+  # llm
+
+  - langchain <0.0.267,
+  - langchain-core <0.0.267,
+
   # ml
 
   - pytorch
@@ -67,8 +72,6 @@ dependencies:
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
 
   # llm
-  - langchain
-  - langchain_core
   - langchain_openai

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index f999d19..b19ff77 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -28,8 +28,8 @@
 from asapdiscovery.ml.inference import GATInference, SchnetInference
 from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 from asapdiscovery.ml.models import ASAPMLModelRegistry
-from .llm import _BASIC_ML_LLM
-from .util import _rdkit_smiles_roundtrip, _is_valid_smiles
+import llm
+import util
 
 # from falcbot.sqlite_db import connect_sqlite_db, insert_series, create_series_table
@@ -513,7 +513,7 @@ def pred_matcher(event, logger, context):
 def make_pic50_pred(event, say, context, logger):
     content = event.get("text")
     # parse with LLM
-    worked, model = _BASIC_ML_LLM.query(content)
+    worked, model = llm._BASIC_ML_LLM.query(content)
     if not worked:
         say("Failed to parse the message, try something like `predict pIC50 for SMILES for target `")
         return
@@ -523,7 +523,7 @@ def make_pic50_pred(event, say, context, logger):
     target = model.biological_target
     endpoint = model.property # llm found property better
 
-    if not _is_valid_smiles(smiles):
+    if not util._is_valid_smiles(smiles):
         say(f"Invalid SMILES {smiles}, unable to proceed")
         return
 
@@ -540,7 +540,7 @@ def make_pic50_pred(event, say, context, logger):
         return
 
 
-    smiles = _rdkit_smiles_roundtrip(smiles)
+    smiles = util._rdkit_smiles_roundtrip(smiles)
     gs = GATInference.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
     pred = gs.predict_from_smiles(smiles)
     say(
@@ -663,7 +663,7 @@ def help(say, context, event, logger):
     say(
         "you asked for help or misspelt a command, I can help you with the following commands:"
     )
-    # say("* `@falcbot run FEC on series `")
+    say("* `@falcbot run FEC on series `")
     say("* `@falcbot predict pIC50 for SMILES for target `")
     say("* `@falcbot predict pIC50 for structure for target `")
     say("* `@falcbot list valid targets`")

diff --git a/falcbot/llm.py b/falcbot/llm.py
index c9ec092..41262ec 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -4,7 +4,7 @@
 from asapdiscovery.ml.models import ASAPMLModelRegistry
 from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 
-from .util import _is_valid_smiles
+import util
 
 
 

From e952cd8e7d1071e493e8b6da9c672f6c07f0c855 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Wed, 18 Sep 2024 21:17:09 +1000
Subject: [PATCH 04/13] try llamaindex instead

---
 devtools/conda-envs/falcbot.yaml |  7 +------
 falcbot/llm.py                   | 24 +++++++++++-------------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index 9f0410b..d7208b7 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -21,9 +21,7 @@ dependencies:
   - biopython
 
   # llm
-
-  - langchain <0.0.267,
-  - langchain-core <0.0.267,
+  - llama-index
 
   # ml
 
@@ -71,7 +69,4 @@ dependencies:
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
 
-  # llm
-  - langchain_openai
-

diff --git a/falcbot/llm.py b/falcbot/llm.py
index 41262ec..fa2565a 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -1,11 +1,10 @@
-from langchain_core.prompts import PromptTemplate
 import os
-from langchain_openai import ChatOpenAI
 from asapdiscovery.ml.models import ASAPMLModelRegistry
 from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 
 import util
-
+from llama_index.core.program import LLMTextCompletionProgram
+from llama_index.core import PromptTemplate
 
 
 from pydantic import BaseModel, Field, validator
@@ -46,6 +45,8 @@ class ASAPMLModelQuery(BaseModel):
 def _and_join(lst):
     return " and ".join(lst)
 
+_base_ml_prompt_template = "You are an expert scientist, parse the following making sure all SMILES strings are represented exactly as in the input: Be very careful and use only SMILES already in the prompt. Allowed variables for target are {targets} and for property are {properties} : {query}"
+
 def _make_ml_prompt_template() -> PromptTemplate:
     """
     Create a prompt template for the ASAPMLModelQuery model
@@ -55,11 +56,7 @@ def _make_ml_prompt_template() -> PromptTemplate:
     targets_w_models = ASAPMLModelRegistry.get_targets_with_models()
     target_str = _and_join(targets_w_models)
     properties = _and_join(ASAPMLModelRegistry.get_endpoints())
 
-    return PromptTemplate(
-        template="You are an expert scientist, parse the following making sure all SMILES strings are represented exactly as in the input: Be very careful and use only SMILES already in the prompt. Allowed variables for target are {targets} and for property are {properties} : {query}",
-        input_variables=["query"],
-        partial_variables={"properties": properties, "targets": target_str}
-)
+    return _base_ml_prompt_template.partial_format(targets=target_str, properties=properties)
 
 _ML_PROMPT_TEMPLATE = _make_ml_prompt_template()
@@ -68,7 +65,7 @@
 
 class StructuredLLMQuery:
 
-    def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate, openai_model="gpt-4o",):
+    def __init__(self, pydantic_model: BaseModel, prompt_template: str, openai_model="gpt-4o",):
         """
         """
         self.openai_model = openai_model
@@ -79,14 +76,15 @@ def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate,
         if openai_api_key is None:
             raise ValueError("OPENAI_API_KEY environment variable is not set")
 
-        llm = ChatOpenAI(model=self.openai_model)
+        self.program = LLMTextCompletionProgram.from_defaults(
+            output_cls=self.pydantic_model,
+            prompt_template_str=self.prompt_template,
+            verbose=True)
 
-        structured_llm = llm.with_structured_output(self.pydantic_model)
-        self.chain = prompt_template | structured_llm
 
     def query(self, query: str):
         try:
-            parsed_model = self.chain.invoke({'query': query})
+            parsed_model = self.program(query=query)
             return True, parsed_model
         except Exception as e:
             print(e)

From 8da74d1279bab9bf054ecf838863822b07608cca Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 17:14:22 +1000
Subject: [PATCH 05/13] working

---
 devtools/conda-envs/falcbot.yaml | 10 +++++-----
 falcbot/falcbot.py               |  9 +++++----
 falcbot/llm.py                   | 28 ++++++++++++++++++----------
 3 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index d7208b7..8750a97 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -63,10 +63,10 @@ dependencies:
 
   # Pip-only installs
   - pip:
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index b19ff77..5056dfd 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -515,7 +515,7 @@ def make_pic50_pred(event, say, context, logger):
     # parse with LLM
     worked, model = llm._BASIC_ML_LLM.query(content)
     if not worked:
-        say("Failed to parse the message, try something like `predict pIC50 for SMILES for target `")
+        say("Failed to parse the message, try something like `predict pIC50 for SMILES for pro for target `")
         return
 
     # get the SMILES, target and property as parsed by the LLM
@@ -538,9 +538,10 @@ def make_pic50_pred(event, say, context, logger):
         return
 
 
     smiles = util._rdkit_smiles_roundtrip(smiles)
-    gs = GATInference.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
-    pred = gs.predict_from_smiles(smiles)
+    model = ASAPMLModelRegistry.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
+    infr = GATInference.from_ml_model_spec(model)
+    pred = infr.predict_from_smiles(smiles)
     say(
-        f"Predicted {endpoint} for {smiles} is {pred:.2f} using model {gs.model_name} :test_tube:"
+        f"Predicted {target} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:"
     )

diff --git a/falcbot/llm.py b/falcbot/llm.py
index fa2565a..0bd14a3 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -5,6 +5,7 @@
 import util
 from llama_index.core.program import LLMTextCompletionProgram
 from llama_index.core import PromptTemplate
+from llama_index.llms.openai import OpenAI
 
 
 from pydantic import BaseModel, Field, validator
@@ -53,10 +54,13 @@ def _make_ml_prompt_template() -> PromptTemplate:
     """
     # join to make a string with "and" between each
    targets_w_models = ASAPMLModelRegistry.get_targets_with_models()
+    # filter out None values
+    targets_w_models = [t for t in targets_w_models if t is not None]
     target_str = _and_join(targets_w_models)
     properties = _and_join(ASAPMLModelRegistry.get_endpoints())
-
-    return _base_ml_prompt_template.partial_format(targets=target_str, properties=properties)
+    pt = PromptTemplate(_base_ml_prompt_template)
+    formatted = pt.partial_format(targets=target_str, properties=properties)
+    return formatted
 
 _ML_PROMPT_TEMPLATE = _make_ml_prompt_template()
@@ -65,7 +69,7 @@
 
 class StructuredLLMQuery:
 
-    def __init__(self, pydantic_model: BaseModel, prompt_template: str, openai_model="gpt-4o",):
+    def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate, openai_model="gpt-4o",):
         """
         """
         self.openai_model = openai_model
@@ -76,19 +80,23 @@ def __init__(self, pydantic_model: BaseModel, prompt_template: str, openai_mode
         if openai_api_key is None:
             raise ValueError("OPENAI_API_KEY environment variable is not set")
 
+        llm = OpenAI(model=self.openai_model)
+
         self.program = LLMTextCompletionProgram.from_defaults(
             output_cls=self.pydantic_model,
-            prompt_template_str=self.prompt_template,
+            prompt=self.prompt_template,
+            llm=llm,
             verbose=True)
 
 
     def query(self, query: str):
-        try:
-            parsed_model = self.program(query=query)
-            return True, parsed_model
+        # try:
+        parsed_model = self.program(query=query)
+        return True, parsed_model
 
-        except Exception as e:
-            print(e)
-            return False, None
+        # except Exception as e:
+        #     print(e)
+        #     return False, None
 
 _BASIC_ML_LLM = StructuredLLMQuery(ASAPMLModelQuery, _ML_PROMPT_TEMPLATE)
\ No newline at end of file

From d66bf5c9bf1b0a70951f227e9add3549778f6dcb Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 17:21:51 +1000
Subject: [PATCH 06/13] endpoint matcher

---
 falcbot/falcbot.py | 358 +++------------------------------------------
 1 file changed, 21 insertions(+), 337 deletions(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 5056dfd..50ec1a3 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -167,337 +167,6 @@ def are_you_alive(event, say, context, logger):
     say(f"yes im alive!")
 
 
-def query_all_networks_matcher(event, logger, context):
-    # regex for any instance of help, case insensitive with optional spaces
-    msg = event.get("text", None)
-    if not msg:
-        return False
-    pattern = r"(?i)query all networks"
-    match = re.search(pattern, msg)
-    return match
-
-
-@app.event("app_mention", matchers=[query_all_networks_matcher])
-def query_all_networks(event, say, context, logger):
-    logger.debug("Querying all networks")
-    client = AlchemiscaleHelper()
-    scope_status_dict = client._client.get_scope_status(visualize=False)
-    for k, v in scope_status_dict.items():
-        say(f"Status {k} has count {v}")
-
-    say("________________________________")
-    say("Checking for running networks...")
-
-    running_networks = client._client.query_networks()
-
-    if not running_networks:
-        say("No networks are running currently")
-        return
-
-    networks_status = client._client.get_networks_status(running_networks)
-    networks_actioned_tasks = client._client.get_networks_actioned_tasks(
-        running_networks
-    )
-
-    for key, network_status, actioned_tasks in zip(
-        running_networks, networks_status, networks_actioned_tasks
-    ):
-        if (
-            "running" in network_status or "waiting" in network_status
-        ) and actioned_tasks:
-            say(f"Network {key} has following status breakdown")
-            state_breakdown = ""
-            for state in _status_keys:
-                state_breakdown += f"{state}: {network_status.get(state, 0)} "
-            say(state_breakdown)
-            say("________________________________")
-    say("Done :smile:")
-
-
-def run_fec_matcher(event, logger, context):
-    # regex for any instance of help, case insensitive with optional spaces
-    msg = event.get("text", None)
-    if not msg:
-        return False
-    pattern = r"(?i)run FEC"
-    match = re.search(pattern, msg)
-    return match
-
-
-@app.event("app_mention", matchers=[run_fec_matcher])
-def run_fec(event, say, context, logger):
-    logger.info("Planning and submitting from postera")
-    say(
-        "Preparing your calculation, please wait this may take a while, ... :ghost: :ghost: :ghost:"
-    )
-    content = event.get("text")
-    # parse message for molset using regex
-    pattern = r"on series\s+.*?(\b[^\s]+\b)+"
-    match = re.search(pattern, content)
-    if match:
-        postera_molset_name = match.group(1)
-        logger.info(f"Postera molecule set name is {postera_molset_name}")
-    else:
-        say(
-            "Could not find postera molecule set name in the message, unable to proceed"
-        )
-        return
-
-    campaign = "confidential"
-
-    # check for attatched file
-    files = event.get("files")
-    if not files:
-        logger.info("No file attatched, unable to proceed")
-        say("No receptor file attatched, unable to proceed")
-        return
-    else:
-        if len(files) > 1:
-            logger.info("More than one file attatched, unable to proceed")
-            say("More than one file attatched, unable to proceed")
-            return
-        # get the first file
-        file = files[0]
-        # check if it is a pdb file
-        file_extn = file.get("title").split(".")[-1]
-        if file_extn != "pdb":
-            say("Attatched file is not a pdb file, unable to proceed")
-            return
-
-    # load ligands from postera
-    try:
-        input_ligands = PosteraFactory(molecule_set_name=postera_molset_name).pull()
-    except Exception as e:
-        say(f"Failed to pull ligands from postera with error: {e}")
-        return
-
-    say(
-        f"Input series has {len(input_ligands)} ligands, this may take a while to process. I'll let you know once its running. Please be patient :ghost: :ghost: :ghost:"
-    )
-    fixed_ligands = []
-    # add hydrogens to ligands
-    for ligand in input_ligands:
-        mol = ligand.to_oemol()
-        oechem.OEAddExplicitHydrogens(mol)
-        fixed_ligands.append(Ligand.from_oemol(mol))
-    input_ligands = fixed_ligands
-    # create dataset name
-    dataset_name = postera_molset_name.replace("-", "_") + "_" + "FALCBot"
-    project = dataset_name
-
-    # run prep workflow
-    logger.info("Running prep workflow")
-
-    prep_factory = AlchemyPrepWorkflow()
-
-    # load receptor from attatched file
-    # read into temp file
-    # TODO move to pre-prepped PDBs hosted on the cloud instance and pull from there
-    try:
-        with NamedTemporaryFile(suffix=".pdb") as temp:
-            logger.info(f"file: {file.get('url_private_download')}")
-            _download_slack_file(file.get("url_private_download"), temp.name)
-            ref_complex = Complex.from_pdb(
-                temp.name,
-                target_kwargs={"target_name": f"{dataset_name}_receptor"},
-                ligand_kwargs={"compound_name": f"{dataset_name}_receptor_ligand"},
-            )
-    except Exception as e:
-        say(f"Failed to load receptor from attatched file with error: {e}")
-        return
-    # prep the complex
-    logger.info("Prepping complex")
-    prepped_ref_complex = PreppedComplex.from_complex(ref_complex)
-
-    import time
-
-    logger.info("Creating alchemy dataset")
-    processors = cpu_count() - 1
-    logger.info(f"Using {processors} processors")
-    start_time = time.time()
-    alchemy_dataset = prep_factory.create_alchemy_dataset(
-        dataset_name=dataset_name,
-        ligands=input_ligands,
-        reference_complex=prepped_ref_complex,
-        processors=processors,
-    )
-    end_time = time.time()
-    execution_time = end_time - start_time
-    logger.info(f"Time taken to create alchemy dataset: {execution_time} seconds")
-
-    # check for failed ligands
-    logger.info("Checking for failed ligands")
-    if alchemy_dataset.failed_ligands:
-        fails = sum([len(values) for values in alchemy_dataset.failed_ligands.values()])
-        say(f"Failed to prep {fails} ligands")
-        # add more detail
-
-    # we have our working ligands
-    posed_ligands = alchemy_dataset.posed_ligands
-
-    # ok now onto actual network creation
-    logger.info("Creating factory and planned network")
-    factory = FreeEnergyCalculationFactory()
-
-    # create receptor
-    # write to a temp pdb file and read back in
-    with NamedTemporaryFile(suffix=".pdb") as fp:
-        alchemy_dataset.reference_complex.target.to_pdb_file(fp.name)
-        receptor = ProteinComponent.from_pdb_file(fp.name)
-
-    # create factory
-    logger.info("Planning network with factory and planned network")
-    planned_network = factory.create_fec_dataset(
-        dataset_name=dataset_name,
-        receptor=receptor,
-        ligands=posed_ligands,
-        central_ligand=None,
-        experimental_protocol=None,
-    )
-
-    # we want to return links to the factory and planned network
-    # we do this through artifacts in a cloudfront exposed bucket
-    cf = CloudFront.from_settings(CloudfrontSettings())
-    s3 = S3.from_settings(S3Settings())
-
-    # push factory to cloudfront exposed bucket
-    factory_fname = f"fec_factory-{dataset_name}.json"
-    factory_bucket_path = f"alchemy/{dataset_name}/{factory_fname}"
-    with NamedTemporaryFile() as temp:
-        factory.to_file(filename=temp.name)
-        factory_cf_url = _push_to_s3_with_cloudfront(
-            s3, cf, factory_bucket_path, temp.name, content_type="application/json"
-        )
-
-    planned_network_fname = f"planned_network-{dataset_name}.json"
-    planned_network_bucket_path = f"alchemy/{dataset_name}/{planned_network_fname}"
-    # push planned network to cloudfront exposed bucket
-    with NamedTemporaryFile() as temp:
-        planned_network.to_file(filename=temp.name)
-        planned_network_cf_url = _push_to_s3_with_cloudfront(
-            s3,
-            cf,
-            planned_network_bucket_path,
-            temp.name,
-            content_type="application/json",
-        )
-
-    ligands_fname = f"ligands-{dataset_name}.sdf"
-    ligands_fname_bucket_path = f"alchemy/{dataset_name}/{ligands_fname}"
-    # push planned network to cloudfront exposed bucket
-    with NamedTemporaryFile(suffix=".sdf") as temp:
-        alchemy_dataset.save_posed_ligands(temp.name)
-        ligand_cf_url = _push_to_s3_with_cloudfront(
-            s3,
-            cf,
-            ligands_fname_bucket_path,
-            temp.name,
-            content_type="text/plain",
-        )
-
-    receptor_fname = f"receptor-{dataset_name}.pdb"
-    receptor_fname_bucket_path = f"alchemy/{dataset_name}/{receptor_fname}"
-    # push planned network to cloudfront exposed bucket
-    with NamedTemporaryFile(suffix=".pdb") as temp:
-        alchemy_dataset.reference_complex.target.to_pdb_file(temp.name)
-        receptor_cf_url = _push_to_s3_with_cloudfront(
-            s3,
-            cf,
-            receptor_fname_bucket_path,
-            temp.name,
-            content_type="text/plain",
-        )
-
-    logger.info(f"Data set name: {dataset_name}")
-    logger.info(f"Factory url: {factory_cf_url}")
-    logger.info(f"Planned network url: {planned_network_cf_url}")
-    logger.info(f"Ligands url: {ligand_cf_url}")
-    logger.info(f"Receptor url: {receptor_cf_url}")
-
-    # submit the network
-    client = AlchemiscaleHelper()
-
-    network_scope = Scope(org="asap", campaign=campaign, project=project)
-    submitted_network = client.create_network(
-        planned_network=planned_network, scope=network_scope
-    )
-    task_ids = client.action_network(
-        planned_network=submitted_network, prioritize=False
-    )
-    logger.debug(
-        f"Submitted network {submitted_network.results.network_key} with task ids {task_ids} to campaign {campaign} and project {project}."
-    )
-    # except Exception as e:
-    #     say(f"Failed to submit network with error: {e}")
-    #     return
-
-    insert_series(
-        db_connection,
-        dataset_name,
-        factory_cf_url,
-        planned_network_cf_url,
-        ligand_cf_url,
-        receptor_cf_url,
-    )
-
-    say(
-        f"Simulations are running! :rocket: :rocket: :rocket: Your project name is: {project}, to debug use `@falcbot debug series {dataset_name}`"
-    )
-
-
-def debug_series_matcher(event, logger, context):
-    # regex for any instance of help, case insensitive with optional spaces
-    msg = event.get("text", None)
-    if not msg:
-        return False
-    pattern = r"(?i)debug series"
-    match = re.search(pattern, msg)
-    return match
-
-
-@app.event("app_mention", matchers=[debug_series_matcher])
-def debug_series(event, say, context, logger):
-    message = event.get("text")
-    pattern = r"series\s+.*?(\b[^\s]+\b)+"
-    match = re.search(pattern, message)
-    if match:
-        series_name = match.group(1)
-        logger.info(f"Series name is {series_name}")
-    else:
-        say("Could not find series name in the message, unable to proceed")
-        return
-
-    # query the database
-    series = query_series_by_name(db_connection, series_name)
-    if not series:
-        say(f"Series {series_name} not found in the database, unable to proceed")
-        return
-    say(f"Series {series_name} found with values: {series}")
-
-    ligand_cf_url = series[4]
-    receptor_cf_url = series[5]
-    factory_cf_url = series[2]
-    planned_network_cf_url = series[3]
-
-    # make block data from the links
-    block_data = [
-        {
-            "type": "section",
-            "text": {
-                "type": "mrkdwn",
-                "text": "Links to your debugging info :pill: :pill: :pill:",
-            },
-        },
-        _link_to_block_data(ligand_cf_url, "Ligand SDF file"),
-        _link_to_block_data(receptor_cf_url, "Receptor PDB file"),
-        _link_to_block_data(factory_cf_url, "FECFactory JSON"),
-        _link_to_block_data(planned_network_cf_url, "PlannedNetwork JSON"),
-    ]
-
-    say("Links to your debugging info:", blocks=block_data)
-
-    return
-
 
 def pred_matcher(event, logger, context):
     # regex for any instance of help, case insensitive with optional spaces
@@ -633,10 +302,26 @@ def list_targets_matcher(event, logger, context):
 
 @app.event("app_mention", matchers=[list_targets_matcher])
 def list_all_targets(say, context, logger):
     say(f"Targets: {ASAPMLModelRegistry.get_targets_with_models()}")
     return
 
 
+def list_endpoints_matcher(event, logger, context):
+    # regex for any instance of help, case insensitive with optional spaces
+    msg = event.get("text", None)
+    if not event:
+        return False
+    pattern = r"(?i)list valid endpoints"
+    match = re.search(pattern, msg)
+    return match
+
+
+@app.event("app_mention", matchers=[list_targets_matcher])
+def list_endpoints(say, context, logger):
+    say(f"Endpoints: {ASAPMLModelRegistry.get_endpoints()}")
+    return
+
+
 def help_matcher(event, logger, context):
     # regex for any instance of help, case insensitive with optional spaces
     msg = event.get("text", None)
@@ -651,6 +336,7 @@ def help_matcher(event, logger, context):
     match = re.search(pattern, msg)
     return match
 
+
 @app.event("app_mention", matchers=[help_matcher])
 def help_with_msg(say, context, event, logger):
     help(say, context, event, logger)
@@ -662,13 +348,11 @@ def help_on_mention(say, context, event, logger):
 
 def help(say, context, event, logger):
     say(
-        "you asked for help or misspelt a command, I can help you with the following commands:"
+        "you asked for help or misspelt a command, I can help you with the following commands:\n"
     )
-    say("* `@falcbot run FEC on series `")
-    say("* `@falcbot predict pIC50 for SMILES for target `")
+    say("* `@falcbot predict for compound for `")
     say("* `@falcbot predict pIC50 for structure for target `")
     say("* `@falcbot list valid targets`")
-    say("* `@falcbot query all networks`")
-    say("* `@falcbot debug series `")
+    say("* `@falcbot list valid endpoints`")
     say("* `@falcbot are you alive`")
     say("* `@falcbot help`")

From 5c7d2fe52fadb129e47868975c942e4a9993c4be Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:04:19 +1000
Subject: [PATCH 07/13] working?

---
 falcbot/falcbot.py | 106 +++++++++++----------------------------------
 falcbot/llm.py     |  49 ++++++++++++----------
 2 files changed, 45 insertions(+), 110 deletions(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 50ec1a3..01f47b3 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -171,10 +171,12 @@ def pred_matcher(event, logger, context):
     # regex for any instance of help, case insensitive with optional spaces
     msg = event.get("text", None)
     if not msg:
         return False
-    pattern = r"(?i)predict"
-    match = re.search(pattern, msg)
-    return match
+    worked, model = llm._IS_ML_QUERY_LLM.query(msg)
+    if not worked:
+        return False
+    if worked:
+        return model.value
 
 
 @app.event("app_mention", matchers=[pred_matcher])
@@ -185,7 +187,7 @@ def make_pic50_pred(event, say, context, logger):
     # parse with LLM
     worked, model = llm._BASIC_ML_LLM.query(content)
     if not worked:
-        say("Failed to parse the message, try something like `predict pIC50 for SMILES for pro for target `")
+        say("Failed to parse the message, try something like `I would like to predict pIC50 for compound CCCC for MERS`")
         return
 
     # get the SMILES, target and property as parsed by the LLM
@@ -196,104 +198,49 @@ def make_pic50_pred(event, say, context, logger):
     target = model.biological_target
     endpoint = model.property # llm found property better
 
+
     if not util._is_valid_smiles(smiles):
         say(f"Invalid SMILES {smiles}, unable to proceed")
         return
+
     if not target in ASAPMLModelRegistry.get_targets_with_models():
         say(
             f"Invalid target {target}, not in: {ASAPMLModelRegistry.get_targets_with_models()}; unable to proceed"
         )
         return
 
+
     if not endpoint in ASAPMLModelRegistry.get_endpoints():
         say(
             f"Invalid endpoint {endpoint}, not in: {ASAPMLModelRegistry.get_endpoints()}; unable to proceed"
         )
         return
 
+    _global_model = False
+
+    if not ASAPMLModelRegistry.endpoint_has_target(endpoint):
+        _target = None
+        _global_model = True
+    else:
+        _target = target
+
 
     smiles = util._rdkit_smiles_roundtrip(smiles)
-    model = ASAPMLModelRegistry.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
+    model = ASAPMLModelRegistry.get_latest_model_for_target_type_and_endpoint(_target, "GAT", endpoint)
+    if model is None:
+        say(f"No model found for {target} {endpoint}")
+        return
     infr = GATInference.from_ml_model_spec(model)
     pred = infr.predict_from_smiles(smiles)
     say(
-        f"Predicted {target} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:"
+        f"Predicted {target} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
     )
 
     # TODO make pred for every target if none specified
 
+    return
+
 
-def make_structural_pred_matcher(event, logger, context):
-    # regex for any instance of help, case insensitive with optional spaces
-    msg = event.get("text", None)
-    if not msg:
-        return False
-    pattern = r"(?i)predict pIC50 for structure"
-    match = re.search(pattern, msg)
-    return match
-
-
-@app.event("app_mention", matchers=[make_structural_pred_matcher])
-def make_structural_pred(event, say, context, logger):
-    content = event.get("text")
-    # parse message for molset using regex
-    pattern = r"(?i)\s+for\s+target\s+(.+)"
-    match = re.search(pattern, content)
-    if match:
-        target = match.group(1)
-    else:
-        say("Could not find Target in the message, unable to proceed")
-        return
-
-    allowed_targets = list(
-        set(ASAPMLModelRegistry.get_targets_with_models()) - {"SARS-CoV-2-Mac1"}
-    )  # remove SARS-CoV-2-Mac1, currently not supported
-
-    if not target in allowed_targets:
-        say(f"Invalid target {target}, not in: {allowed_targets}; unable to proceed")
-        return
-
-    # check for attatched file
-    files = event.get("files")
-    if not files:
-        logger.info("No file attatched, unable to proceed")
-        say("No pdb file attatched, unable to proceed")
-        return
-    else:
-        if len(files) > 1:
-            logger.info("More than one file attatched, unable to proceed")
-            say("More than one file attatched, unable to proceed")
-            return
-        # get the first file
-        file = files[0]
-        title = file.get("title")
-        # check if it is a pdb file
-        file_extn = file.get("title").split(".")[-1]
-        if file_extn != "pdb":
-            say("Attatched file is not a pdb file, unable to proceed")
-            return
-
-    try:
-        with NamedTemporaryFile(suffix=".pdb") as temp:
-            logger.info(f"file: {file.get('url_private_download')}")
-            _download_slack_file(file.get("url_private_download"), temp.name)
-            ref_complex = Complex.from_pdb(
-                temp.name,
-                target_kwargs={"target_name": f"receptor"},
-                ligand_kwargs={"compound_name": f"receptor_ligand"},
-            )
-    except Exception as e:
-        say(f"Failed to load receptor from attatched file with error: {e}")
-        return
-
-    # make prediction
-    si = SchnetInference.from_latest_by_target(target)
-    pred = si.predict_from_oemol(ref_complex.to_combined_oemol())
-    say(
-        f"Predicted pIC50 for {title} is {pred:.2f} using model {si.model_name} :test_tube:"
-    )
-
-    # TODO make pred for every target if none specified
 
 
 def list_targets_matcher(event, logger, context):
@@ -246,7 +178,10 @@ def list_targets_matcher(event, logger, context):
 
 @app.event("app_mention", matchers=[list_targets_matcher])
 def list_all_targets(say, context, logger):
-    say(f"Targets: {ASAPMLModelRegistry.get_targets_with_models()}")
+    targets = ASAPMLModelRegistry.get_targets_with_models()
+    # filter out None values
+    targets = [t for t in targets if t is not None]
+    say(f"Targets: {targets}")
     return
@@ -264,7 +199,7 @@ def list_endpoints_matcher(event, logger, context):
     return match
 
 
-@app.event("app_mention", matchers=[list_targets_matcher])
+@app.event("app_mention", matchers=[list_endpoints_matcher])
 def list_endpoints(say, context, logger):
     say(f"Endpoints: {ASAPMLModelRegistry.get_endpoints()}")
     return
@@ -298,7 +233,6 @@ def help(say, context, event, logger):
         "you asked for help or misspelt a command, I can help you with the following commands:\n"
     )
     say("* `@falcbot predict for compound for `")
-    say("* `@falcbot predict pIC50 for structure for target `")
     say("* `@falcbot list valid targets`")
     say("* `@falcbot list valid endpoints`")
     say("* `@falcbot are you alive`")

diff --git a/falcbot/llm.py b/falcbot/llm.py
index 0bd14a3..38657da 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -19,29 +19,12 @@ class ASAPMLModelQuery(BaseModel):
     property: str = Field(..., description="Measured property for the compound")
 
 
-    # VALIDATE IN slack function to give feedback?
-
-    # @validator("SMILES")
-    # @classmethod
-    # def validate_smiles(cls, v):
-    #     if not _is_valid_smiles(v):
-    #         raise ValueError("Invalid SMILES string")
-    #     return v
-
-    # @validator("biological_target")
-    # @classmethod
-    # def validate_target(cls, v):
-    #     if v not in TargetTags.get_values():
-    #         raise ValueError("Invalid target")
-    #     return v
-
-    # @validator("property")
-    # @classmethod
-    # def validate_property(cls, v):
-    #     if v not in ASAPMLModelRegistry.get_endpoints():
-    #         raise ValueError("Invalid property")
-    #     return v
-
+class IsMLQuery(BaseModel):
+    """
+    Model that checks if a query is a machine learning query
+    """
+    value: bool = Field(..., description="Boolean value indicating if the query is a machine learning query")
+
 
 def _and_join(lst):
     return " and ".join(lst)
@@ -64,6 +47,10 @@ def _make_ml_prompt_template() -> PromptTemplate:
 _ML_PROMPT_TEMPLATE = _make_ml_prompt_template()
 
 
+_base_is_query_prompt_template = "You are an expert scientist, parse the following and determine if it is a request for a prediction from a machine learning model, look for words like predict, : {query}"
+_IS_ML_QUERY_PROMPT_TEMPLATE = PromptTemplate(_base_is_query_prompt_template)
+
+
 
 class StructuredLLMQuery:
@@ -90,12 +77,14 @@ def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate,
 
 
     def query(self, query: str):
-        # try:
-        parsed_model = self.program(query=query)
-        return True, parsed_model
+        try:
+            parsed_model = self.program(query=query)
+            return True, parsed_model
 
-        # except Exception as e:
-        #     print(e)
-        #     return False, None
+        except Exception as e:
+            print(e)
+            return False, None
+
+_BASIC_ML_LLM = StructuredLLMQuery(ASAPMLModelQuery, _ML_PROMPT_TEMPLATE)
 
-_BASIC_ML_LLM = StructuredLLMQuery(ASAPMLModelQuery, _ML_PROMPT_TEMPLATE)
\ No newline at end of file
+_IS_ML_QUERY_LLM = StructuredLLMQuery(IsMLQuery, _IS_ML_QUERY_PROMPT_TEMPLATE)
\ No newline at end of file

From 2b122af0f91bc6b271d9bf879ebce0e5e2225013 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:07:06 +1000
Subject: [PATCH 08/13] semi working

---
 devtools/conda-envs/falcbot.yaml | 10 ++---
 falcbot/falcbot.py               | 96 --------------------------------
 2 files changed, 5 insertions(+), 101 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index 8750a97..d7208b7 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -63,10 +63,10 @@ dependencies:
 
   # Pip-only installs
   - pip:
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 01f47b3..9f8bade 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -51,102 +51,6 @@ class SlackSettings(BaseSettings):
     )
 
 
-def connect_sqlite_db(path, check_same_thread=False):
-    connection = None
-    try:
-        connection = sqlite3.connect(path, check_same_thread=check_same_thread)
-        print("Connection to SQLite DB successful")
-    except sqlite3.Error as e:
-        print(f"The error '{e}' occurred")
-
-    return connection
-
-
-def execute_query(connection, query):
-    cursor = connection.cursor()
-    try:
-        cursor.execute(query)
-        connection.commit()
-        print("Query executed successfully")
-    except sqlite3.Error as e:
-        print(f"The error '{e}' occurred")
-
-
-def create_series_table(connection):
-    create_series_table_query = """
-    CREATE TABLE IF NOT EXISTS series (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
-        name TEXT NOT NULL UNIQUE,
-        factory_url TEXT NOT NULL,
-        planned_network_url TEXT NOT NULL,
-        ligands_url TEXT NOT NULL,
-        receptor_url TEXT NOT NULL
-
-    );
-    """
-    execute_query(connection, create_series_table_query)
-
-
-def insert_series(
-    connection, name, factory_url, planned_network_url, ligands_url, receptor_url
-):
-    insert_series_query = f"""
-    INSERT INTO series (name, factory_url, planned_network_url, ligands_url, receptor_url)
-    VALUES ('{name}', '{factory_url}', '{planned_network_url}', '{ligands_url}', '{receptor_url}');
-    """
-    execute_query(connection, insert_series_query)
-
-
-def query_series_by_name(connection, name):
-    query = f"SELECT * FROM series WHERE name='{name}'"
-    cursor = connection.cursor()
-    cursor.execute(query)
-    # unpack into a dictionary
-    series = cursor.fetchone()
-    print(series)
-    return series
-
-
 settings = SlackSettings()
 app = App(token=settings.SLACK_BOT_TOKEN)
-db_connection = connect_sqlite_db("falcbot.sqlite3", check_same_thread=False)
-create_series_table(db_connection)
-
-
-_status_keys = ["complete", "running", "waiting", "error", "invalid", "deleted"]
-
-
-def _download_slack_file(file_url, file_name):
-    import requests
-
-    headers = {"Authorization": f"Bearer {settings.SLACK_BOT_TOKEN}"}
-
-    response = requests.get(file_url, headers=headers, stream=True)
-    response.raise_for_status()
-    with open(file_name, "wb") as f:
-        for chunk in response.iter_content(chunk_size=2048):
-            f.write(chunk)
-
-
-def _push_to_s3_with_cloudfront(
-    s3_instance: S3,
-    cloudfront_instance: CloudFront,
-    bucket_path: str,
-    file_path: str,
-    expires_delta: timedelta = timedelta(days=365 * 5),
-    content_type: str = "application/json",
-) -> str:
-    # push to s3
-    s3_instance.push_file(file_path, location=bucket_path, content_type=content_type)
-    # generate cloudfront url
-    expiry = datetime.utcnow() + expires_delta
-    return cloudfront_instance.generate_signed_url(bucket_path, expiry)
-
-
-def _link_to_block_data(link, text):
-    return {
-        "type": "section",
-        "text": {"type": "mrkdwn", "text": f"<{link}|{text}>"},
-    }
-
 

From 851f187317c6eb9038416957379ca2abfb782933 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:20:35 +1000
Subject: [PATCH 09/13] simplify str

---
 falcbot/falcbot.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 9f8bade..c3e6b3d 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -120,8 +120,10 @@ def make_pic50_pred(event, say, context, logger):
     if not ASAPMLModelRegistry.endpoint_has_target(endpoint):
         _target = None
         _global_model = True
+        _target_str = "global"
     else:
         _target = target
+        _target_str = target
 
 
@@ -133,7 +135,7 @@ def make_pic50_pred(event, say, context, logger):
     infr = GATInference.from_ml_model_spec(model)
     pred = infr.predict_from_smiles(smiles)
     say(
-        f"Predicted {target} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
+        f"Predicted {_target_str} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
     )
 
     # TODO make pred for every target if none specified

From 97761c8384cbc31e057c118064e1e35f293e9a41 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:23:12 +1000
Subject: [PATCH 10/13] cleanup

---
 falcbot/falcbot.py | 29 +----------------------------
 falcbot/llm.py     |  2 --
 2 files changed, 1 insertion(+), 30 deletions(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index c3e6b3d..3b2a792 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -1,41 +1,14 @@
 import logging
 import re
-import uuid
 import logging
-from datetime import datetime, timedelta
-from tempfile import NamedTemporaryFile
 from pydantic import BaseSettings, Field
 from slack_bolt import App
 from slack_bolt.adapter.socket_mode import SocketModeHandler
-
-from alchemiscale import Scope
-from openfe import ProteinComponent
-from asapdiscovery.alchemy.schema.fec import (
-    FreeEnergyCalculationFactory,
-    AlchemiscaleSettings,
-)
-from asapdiscovery.alchemy.schema.prep_workflow import AlchemyPrepWorkflow
-from asapdiscovery.alchemy.utils import AlchemiscaleHelper
-
-from asapdiscovery.data.schema.complex import Complex, PreppedComplex
-from asapdiscovery.data.schema.ligand import Ligand
-from asapdiscovery.data.backend.openeye import oechem
-from asapdiscovery.data.services.postera.postera_factory import PosteraFactory
-from asapdiscovery.data.services.services_config import CloudfrontSettings, S3Settings
-from asapdiscovery.data.services.aws.cloudfront import CloudFront
-from asapdiscovery.data.services.aws.s3 import S3
-
-from asapdiscovery.ml.inference import GATInference, SchnetInference
-from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
+from asapdiscovery.ml.inference import GATInference
 from asapdiscovery.ml.models import ASAPMLModelRegistry
 import llm
 import util
 
-# from falcbot.sqlite_db import connect_sqlite_db, insert_series, create_series_table
-
-from rdkit import Chem
-import sqlite3
-from multiprocessing import cpu_count
 
 # logger in a global context
 logging.basicConfig(level=logging.DEBUG)

diff --git a/falcbot/llm.py b/falcbot/llm.py
index 38657da..f7e768b 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -1,8 +1,6 @@
 import os
 from asapdiscovery.ml.models import ASAPMLModelRegistry
-from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 
-import util
 from llama_index.core.program import LLMTextCompletionProgram
 from llama_index.core import PromptTemplate
 from llama_index.llms.openai import OpenAI

From 88d947f9a3bfeefa32f2840d367e6b737b8d2434 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:27:21 +1000
Subject: [PATCH 11/13] remove alchemy deps

---
 devtools/conda-envs/falcbot.yaml | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index d7208b7..7b90489 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -41,17 +41,6 @@ dependencies:
   - wandb
   - semver
 
-  # alchemy
-  - numpy
-  - gufe =>0.9.5
-  - httpx
-  - perses >=0.10.2
-  - kartograf
-  - rich
-  - alchemiscale-client
-  - cinnabar >=0.4.1
-  - openeye-toolkits
-  - openfe
 
   # other asapdiscovery deps
   - distributed

From 3763d73e458c3a0bc67f2792e185205a9fc754d8 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:31:45 +1000
Subject: [PATCH 12/13] re-add openeye

---
 devtools/conda-envs/falcbot.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index 7b90489..959304d 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -39,8 +39,7 @@ dependencies:
   - mtenn >=0.5.1
   - wandb
   - semver
-
-
+  - openeye-toolkits
 
   # other asapdiscovery deps
   - distributed

From c5ddc634dc974399650359dc5fd405a61d22875c Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:46:45 +1000
Subject: [PATCH 13/13] add in errors

---
 falcbot/falcbot.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 3b2a792..cda4942 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -2,6 +2,7 @@ import logging
 import re
 import logging
 from pydantic import BaseSettings, Field
+import numpy as np
 from slack_bolt import App
 from slack_bolt.adapter.socket_mode import SocketModeHandler
 from asapdiscovery.ml.inference import GATInference
@@ -108,7 +109,11 @@ def make_pic50_pred(event, say, context, logger):
         say(f"No model found for {target} {endpoint}")
         return
     infr = GATInference.from_ml_model_spec(model)
-    pred = infr.predict_from_smiles(smiles)
+    pred, err = infr.predict_from_smiles(smiles, return_err=True)
+    if np.isnan(err):
+        errstr = " "
+    else:
+        errstr = f" ± {err:.2f} "
     say(
-        f"Predicted {_target_str} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
+        f"Predicted {_target_str} {endpoint} for {smiles} is {pred:.2f}{errstr}using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
     )
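
A quick way to exercise the LLM plumbing from these patches outside Slack is sketched below. This is a minimal sketch and not part of the patch series: it assumes the falcbot conda environment is installed, the falcbot/ directory is on PYTHONPATH, and OPENAI_API_KEY is set; the file name is hypothetical and the query string reuses the example from PATCH 07.

    # smoke_test_llm.py -- illustrative sketch only, not part of the patches
    import llm

    query = "predict pIC50 for compound CCCC for MERS"

    # _IS_ML_QUERY_LLM (PATCH 07) gates whether a message is a prediction request at all
    worked, is_ml = llm._IS_ML_QUERY_LLM.query(query)
    print("is ML query:", worked and is_ml.value)

    # _BASIC_ML_LLM (PATCH 02) extracts the SMILES, target and endpoint
    # as an ASAPMLModelQuery pydantic model
    worked, parsed = llm._BASIC_ML_LLM.query(query)
    if worked:
        print(parsed.SMILES, parsed.biological_target, parsed.property)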