From fb31f3c502c5386669461eee772d7c40ae318e3d Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Tue, 17 Sep 2024 13:07:23 +1000
Subject: [PATCH 01/13] fex env

---
 devtools/conda-envs/falcbot.yaml | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index 0ebe881..de4de99 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -48,6 +48,7 @@ dependencies:
   - alchemiscale-client
   - cinnabar >=0.4.1
   - openeye-toolkits
+  - openfe
 
   # other asapdiscovery deps
   - distributed
@@ -59,11 +60,11 @@ dependencies:
 
   # Pip-only installs
   - pip:
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
-    - git+https://github.com/choderalab/asapdiscovery@cdd_download#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
 
 

From 20b8ccbb3e3238365417e4d9f6659bb9e5ba9be5 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Wed, 18 Sep 2024 20:31:44 +1000
Subject: [PATCH 02/13] start of llm

---
 devtools/conda-envs/falcbot.yaml |  4 ++
 falcbot/falcbot.py               | 53 +++++++++---------
 falcbot/llm.py                   | 95 ++++++++++++++++++++++++++++++++
 falcbot/util.py                  | 13 +++++
 4 files changed, 139 insertions(+), 26 deletions(-)
 create mode 100644 falcbot/llm.py
 create mode 100644 falcbot/util.py

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index de4de99..c8707fd 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -66,5 +66,9 @@ dependencies:
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
 
+  # llm
+  - langchain
+  - langchain_core
+  - langchain_openai
 

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 471c2f3..f999d19 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -28,6 +28,8 @@
 from asapdiscovery.ml.inference import GATInference, SchnetInference
 from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 from asapdiscovery.ml.models import ASAPMLModelRegistry
+from .llm import _BASIC_ML_LLM
+from .util import _rdkit_smiles_roundtrip, _is_valid_smiles
 
 # from falcbot.sqlite_db import connect_sqlite_db, insert_series, create_series_table
@@ -148,17 +150,6 @@ def _link_to_block_data(link, text):
     }
 
 
-def _is_valid_smiles(smi):
-    m = Chem.MolFromSmiles(smi)
-    if m is None:
-        return False
-    else:
-        return True
-
-
-def _rdkit_smiles_roundtrip(smi: str) -> str:
-    mol = Chem.MolFromSmiles(smi)
-    return Chem.MolToSmiles(mol)
 
 
 def are_you_alive_matcher(event, logger, context):
@@ -508,42 +499,52 @@
-def make_pic50_pred_matcher(event, logger, context):
+def pred_matcher(event, logger, context):
     # regex for any instance of help, case insensitive with optional spaces
     msg = event.get("text", None)
     if not msg:
         return False
-    pattern = r"(?i)predict pIC50 for SMILES"
+    pattern = r"(?i)predict"
     match = re.search(pattern, msg)
     return match
 
 
-@app.event("app_mention", matchers=[make_pic50_pred_matcher])
+@app.event("app_mention", matchers=[pred_matcher])
 def make_pic50_pred(event, say, context, logger):
     content = event.get("text")
-    # parse message for molset using regex
-    pattern = r"(?i)SMILES\s+(.+?)\s+for\s+target\s+(.+)"
-    match = re.search(pattern, content)
-    if match:
-        smiles = match.group(1)
-        target = match.group(2)
-    else:
-        say("Could not find SMILES and Target in the message, unable to proceed")
+    # parse with LLM
+    worked, model = _BASIC_ML_LLM.query(content)
+    if not worked:
+        say("Failed to parse the message, try something like `predict pIC50 for SMILES for target `")
         return
+
+    # get the SMILES, target and property as parsed by the LLM
+    smiles = model.SMILES
+    target = model.biological_target
+    endpoint = model.property # llm found property better
+
     if not _is_valid_smiles(smiles):
         say(f"Invalid SMILES {smiles}, unable to proceed")
         return
+
     if not target in ASAPMLModelRegistry.get_targets_with_models():
         say(
             f"Invalid target {target}, not in: {ASAPMLModelRegistry.get_targets_with_models()}; unable to proceed"
         )
         return
-    # make prediction
+
+    if not endpoint in ASAPMLModelRegistry.get_endpoints():
+        say(
+            f"Invalid endpoint {endpoint}, not in: {ASAPMLModelRegistry.get_endpoints()}; unable to proceed"
+        )
+        return
+
     smiles = _rdkit_smiles_roundtrip(smiles)
-    gs = GATInference.from_latest_by_target(target)
+    gs = GATInference.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
     pred = gs.predict_from_smiles(smiles)
     say(
-        f"Predicted pIC50 for {smiles} is {pred:.2f} using model {gs.model_name} :test_tube:"
+        f"Predicted {endpoint} for {smiles} is {pred:.2f} using model {gs.model_name} :test_tube:"
     )
 
     # TODO make pred for every target if none specified
@@ -662,7 +663,7 @@ def help(say, context, event, logger):
     say(
         "you asked for help or misspelt a command, I can help you with the following commands:"
     )
-    say("* `@falcbot run FEC on series `")
+    # say("* `@falcbot run FEC on series `")
     say("* `@falcbot predict pIC50 for SMILES for target `")
     say("* `@falcbot predict pIC50 for structure for target `")
     say("* `@falcbot list valid targets`")

diff --git a/falcbot/llm.py b/falcbot/llm.py
new file mode 100644
index 0000000..c9ec092
--- /dev/null
+++ b/falcbot/llm.py
@@ -0,0 +1,95 @@
+from langchain_core.prompts import PromptTemplate
+import os
+from langchain_openai import ChatOpenAI
+from asapdiscovery.ml.models import ASAPMLModelRegistry
+from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
+
+from .util import _is_valid_smiles
+
+
+
+from pydantic import BaseModel, Field, validator
+
+class ASAPMLModelQuery(BaseModel):
+    """
+    Model that defines the smiles string, biological target and property of interest
+    """
+    SMILES: str = Field(..., description="SMILES string of the query compound")
+    biological_target: str = Field(..., description="Biological target for the compound")
+    property: str = Field(..., description="Measured property for the compound")
+
+
+    # VALIDATE IN slack function to give feedback?
+
+    # @validator("SMILES")
+    # @classmethod
+    # def validate_smiles(cls, v):
+    #     if not _is_valid_smiles(v):
+    #         raise ValueError("Invalid SMILES string")
+    #     return v
+
+    # @validator("biological_target")
+    # @classmethod
+    # def validate_target(cls, v):
+    #     if v not in TargetTags.get_values():
+    #         raise ValueError("Invalid target")
+    #     return v
+
+    # @validator("property")
+    # @classmethod
+    # def validate_property(cls, v):
+    #     if v not in ASAPMLModelRegistry.get_endpoints():
+    #         raise ValueError("Invalid property")
+    #     return v
+
+
+def _and_join(lst):
+    return " and ".join(lst)
+
+def _make_ml_prompt_template() -> PromptTemplate:
+    """
+    Create a prompt template for the ASAPMLModelQuery model
+    """
+    # join to make a string with "and" between each
+    targets_w_models = ASAPMLModelRegistry.get_targets_with_models()
+    target_str = _and_join(targets_w_models)
+    properties = _and_join(ASAPMLModelRegistry.get_endpoints())
+
+    return PromptTemplate(
+        template="You are an expert scientist, parse the following making sure all SMILES strings are represented exactly as in the input: Be very careful and use only SMILES already in the prompt. Allowed variables for target are {targets} and for property are {properties} : {query}",
+        input_variables=["query"],
+        partial_variables={"properties": properties, "targets": target_str}
+)
+
+_ML_PROMPT_TEMPLATE = _make_ml_prompt_template()
+
+
+
+
+class StructuredLLMQuery:
+
+    def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate, openai_model="gpt-4o",):
+        """
+        """
+        self.openai_model = openai_model
+        self.pydantic_model = pydantic_model
+        self.prompt_template = prompt_template
+        # get openai api key
+        openai_api_key = os.getenv("OPENAI_API_KEY")
+        if openai_api_key is None:
+            raise ValueError("OPENAI_API_KEY environment variable is not set")
+
+        llm = ChatOpenAI(model=self.openai_model)
+
+        structured_llm = llm.with_structured_output(self.pydantic_model)
+        self.chain = prompt_template | structured_llm
+
+    def query(self, query: str):
+        try:
+            parsed_model = self.chain.invoke({'query': query})
+            return True, parsed_model
+        except Exception as e:
+            print(e)
+            return False, None
+
+_BASIC_ML_LLM = StructuredLLMQuery(ASAPMLModelQuery, _ML_PROMPT_TEMPLATE)
\ No newline at end of file

diff --git a/falcbot/util.py b/falcbot/util.py
new file mode 100644
index 0000000..0aedddc
--- /dev/null
+++ b/falcbot/util.py
@@ -0,0 +1,13 @@
+from rdkit import Chem
+
+def _is_valid_smiles(smi):
+    m = Chem.MolFromSmiles(smi)
+    if m is None:
+        return False
+    else:
+        return True
+
+
+def _rdkit_smiles_roundtrip(smi: str) -> str:
+    mol = Chem.MolFromSmiles(smi)
+    return Chem.MolToSmiles(mol)

From 63b007917e9a7beefde88f4791266ef44a4dea14 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Wed, 18 Sep 2024 21:01:50 +1000
Subject: [PATCH 03/13] current_state

---
 devtools/conda-envs/falcbot.yaml |  7 +++++--
 falcbot/falcbot.py               | 12 ++++++------
 falcbot/llm.py                   |  2 +-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index c8707fd..9f0410b 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -20,6 +20,11 @@ dependencies:
   - pydantic<2.0a0
   - biopython
 
+  # llm
+
+  - langchain <0.0.267,
+  - langchain-core <0.0.267,
+
   # ml
 
   - pytorch
@@ -67,8 +72,6 @@ dependencies:
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
 
   # llm
-  - langchain
-  - langchain_core
   - langchain_openai

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index f999d19..b19ff77 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -28,8 +28,8 @@
 from asapdiscovery.ml.inference import GATInference, SchnetInference
 from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 from asapdiscovery.ml.models import ASAPMLModelRegistry
-from .llm import _BASIC_ML_LLM
-from .util import _rdkit_smiles_roundtrip, _is_valid_smiles
+import llm
+import util
 
 # from falcbot.sqlite_db import connect_sqlite_db, insert_series, create_series_table
@@ -513,7 +513,7 @@ def pred_matcher(event, logger, context):
 def make_pic50_pred(event, say, context, logger):
     content = event.get("text")
     # parse with LLM
-    worked, model = _BASIC_ML_LLM.query(content)
+    worked, model = llm._BASIC_ML_LLM.query(content)
     if not worked:
         say("Failed to parse the message, try something like `predict pIC50 for SMILES for target `")
         return
@@ -523,7 +523,7 @@ def make_pic50_pred(event, say, context, logger):
     target = model.biological_target
     endpoint = model.property # llm found property better
 
-    if not _is_valid_smiles(smiles):
+    if not util._is_valid_smiles(smiles):
         say(f"Invalid SMILES {smiles}, unable to proceed")
         return
 
@@ -540,7 +540,7 @@ def make_pic50_pred(event, say, context, logger):
         return
 
 
-    smiles = _rdkit_smiles_roundtrip(smiles)
+    smiles = util._rdkit_smiles_roundtrip(smiles)
     gs = GATInference.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
     pred = gs.predict_from_smiles(smiles)
     say(
@@ -663,7 +663,7 @@ def help(say, context, event, logger):
     say(
         "you asked for help or misspelt a command, I can help you with the following commands:"
     )
-    # say("* `@falcbot run FEC on series `")
+    say("* `@falcbot run FEC on series `")
     say("* `@falcbot predict pIC50 for SMILES for target `")
     say("* `@falcbot predict pIC50 for structure for target `")
     say("* `@falcbot list valid targets`")

diff --git a/falcbot/llm.py b/falcbot/llm.py
index c9ec092..41262ec 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -4,7 +4,7 @@
 from asapdiscovery.ml.models import ASAPMLModelRegistry
 from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 
-from .util import _is_valid_smiles
+import util
 
 
 

From e952cd8e7d1071e493e8b6da9c672f6c07f0c855 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Wed, 18 Sep 2024 21:17:09 +1000
Subject: [PATCH 04/13] try llamaindex instead

---
 devtools/conda-envs/falcbot.yaml |  7 +------
 falcbot/llm.py                   | 24 +++++++++++-------------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index 9f0410b..d7208b7 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -21,9 +21,7 @@ dependencies:
   - biopython
 
   # llm
-
-  - langchain <0.0.267,
-  - langchain-core <0.0.267,
+  - llama-index
 
   # ml
 
@@ -71,7 +69,4 @@ dependencies:
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
     - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
 
-  # llm
-  - langchain_openai
-

diff --git a/falcbot/llm.py b/falcbot/llm.py
index 41262ec..fa2565a 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -1,11 +1,10 @@
-from langchain_core.prompts import PromptTemplate
 import os
-from langchain_openai import ChatOpenAI
 from asapdiscovery.ml.models import ASAPMLModelRegistry
 from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 
 import util
-
+from llama_index.core.program import LLMTextCompletionProgram
+from llama_index.core import PromptTemplate
 
 
 from pydantic import BaseModel, Field, validator
@@ -46,6 +45,8 @@ class ASAPMLModelQuery(BaseModel):
 def _and_join(lst):
     return " and ".join(lst)
 
+_base_ml_prompt_template = "You are an expert scientist, parse the following making sure all SMILES strings are represented exactly as in the input: Be very careful and use only SMILES already in the prompt. Allowed variables for target are {targets} and for property are {properties} : {query}"
+
 def _make_ml_prompt_template() -> PromptTemplate:
     """
     Create a prompt template for the ASAPMLModelQuery model
@@ -55,11 +56,7 @@ def _make_ml_prompt_template() -> PromptTemplate:
     targets_w_models = ASAPMLModelRegistry.get_targets_with_models()
     target_str = _and_join(targets_w_models)
     properties = _and_join(ASAPMLModelRegistry.get_endpoints())
 
-    return PromptTemplate(
-        template="You are an expert scientist, parse the following making sure all SMILES strings are represented exactly as in the input: Be very careful and use only SMILES already in the prompt. Allowed variables for target are {targets} and for property are {properties} : {query}",
-        input_variables=["query"],
-        partial_variables={"properties": properties, "targets": target_str}
-)
+    return _base_ml_prompt_template.partial_format(targets=target_str, properties=properties)
 
 _ML_PROMPT_TEMPLATE = _make_ml_prompt_template()
@@ -68,7 +65,7 @@
 
 class StructuredLLMQuery:
 
-    def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate, openai_model="gpt-4o",):
+    def __init__(self, pydantic_model: BaseModel, prompt_template: str, openai_model="gpt-4o",):
         """
         """
         self.openai_model = openai_model
@@ -79,14 +76,15 @@ def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate,
         if openai_api_key is None:
             raise ValueError("OPENAI_API_KEY environment variable is not set")
 
-        llm = ChatOpenAI(model=self.openai_model)
+        self.program = LLMTextCompletionProgram.from_defaults(
+            output_cls=self.pydantic_model,
+            prompt_template_str=self.prompt_template,
+            verbose=True)
 
-        structured_llm = llm.with_structured_output(self.pydantic_model)
-        self.chain = prompt_template | structured_llm
 
     def query(self, query: str):
         try:
-            parsed_model = self.chain.invoke({'query': query})
+            parsed_model = self.program(query=query)
             return True, parsed_model
         except Exception as e:
             print(e)

From 8da74d1279bab9bf054ecf838863822b07608cca Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 17:14:22 +1000
Subject: [PATCH 05/13] working

---
 devtools/conda-envs/falcbot.yaml | 10 +++++-----
 falcbot/falcbot.py               |  9 +++++----
 falcbot/llm.py                   | 28 ++++++++++++++++++----------
 3 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index d7208b7..8750a97 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -63,10 +63,10 @@ dependencies:
 
   # Pip-only installs
   - pip:
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
-    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
+    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index b19ff77..5056dfd 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -515,7 +515,7 @@ def make_pic50_pred(event, say, context, logger):
     # parse with LLM
     worked, model = llm._BASIC_ML_LLM.query(content)
     if not worked:
-        say("Failed to parse the message, try something like `predict pIC50 for SMILES for target `")
+        say("Failed to parse the message, try something like `predict pIC50 for SMILES for pro for target `")
         return
 
     # get the SMILES, target and property as parsed by the LLM
@@ -538,9 +538,10 @@ def make_pic50_pred(event, say, context, logger):
         return
 
 
     smiles = util._rdkit_smiles_roundtrip(smiles)
-    gs = GATInference.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
-    pred = gs.predict_from_smiles(smiles)
+    model = ASAPMLModelRegistry.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
+    infr = GATInference.from_ml_model_spec(model)
+    pred = infr.predict_from_smiles(smiles)
     say(
-        f"Predicted {endpoint} for {smiles} is {pred:.2f} using model {gs.model_name} :test_tube:"
+        f"Predicted {target} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:"
     )

diff --git a/falcbot/llm.py b/falcbot/llm.py
index fa2565a..0bd14a3 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -5,6 +5,7 @@
 import util
 from llama_index.core.program import LLMTextCompletionProgram
 from llama_index.core import PromptTemplate
+from llama_index.llms.openai import OpenAI
 
 
 from pydantic import BaseModel, Field, validator
@@ -53,10 +54,13 @@ def _make_ml_prompt_template() -> PromptTemplate:
     """
     # join to make a string with "and" between each
    targets_w_models = ASAPMLModelRegistry.get_targets_with_models()
+    # filter out None values
+    targets_w_models = [t for t in targets_w_models if t is not None]
     target_str = _and_join(targets_w_models)
     properties = _and_join(ASAPMLModelRegistry.get_endpoints())
-
-    return _base_ml_prompt_template.partial_format(targets=target_str, properties=properties)
+    pt = PromptTemplate(_base_ml_prompt_template)
+    formatted = pt.partial_format(targets=target_str, properties=properties)
+    return formatted
 
 _ML_PROMPT_TEMPLATE = _make_ml_prompt_template()
@@ -65,7 +69,7 @@
 
 class StructuredLLMQuery:
 
-    def __init__(self, pydantic_model: BaseModel, prompt_template: str, openai_model="gpt-4o",):
+    def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate, openai_model="gpt-4o",):
         """
         """
         self.openai_model = openai_model
@@ -76,19 +80,23 @@ def __init__(self, pydantic_model: BaseModel, prompt_template: str, openai_mode
         if openai_api_key is None:
             raise ValueError("OPENAI_API_KEY environment variable is not set")
 
+        llm = OpenAI(model=self.openai_model)
+
         self.program = LLMTextCompletionProgram.from_defaults(
             output_cls=self.pydantic_model,
-            prompt_template_str=self.prompt_template,
+            prompt=self.prompt_template,
+            llm=llm,
             verbose=True)
 
 
     def query(self, query: str):
-        try:
-            parsed_model = self.program(query=query)
-            return True, parsed_model
+        # try:
+        parsed_model = self.program(query=query)
+        return True, parsed_model
 
-        except Exception as e:
-            print(e)
-            return False, None
+        # except Exception as e:
+        #     print(e)
+        #     return False, None
 
 _BASIC_ML_LLM = StructuredLLMQuery(ASAPMLModelQuery, _ML_PROMPT_TEMPLATE)
\ No newline at end of file

From d66bf5c9bf1b0a70951f227e9add3549778f6dcb Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 17:21:51 +1000
Subject: [PATCH 06/13] endpoint matcher

---
 falcbot/falcbot.py | 358 +++------------------------------------------
 1 file changed, 21 insertions(+), 337 deletions(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 5056dfd..50ec1a3 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -167,337 +167,6 @@ def are_you_alive(event, say, context, logger):
     say(f"yes im alive!")
 
 
-def query_all_networks_matcher(event, logger, context):
-    # regex for any instance of help, case insensitive with optional spaces
-    msg = event.get("text", None)
-    if not msg:
-        return False
-    pattern = r"(?i)query all networks"
-    match = re.search(pattern, msg)
-    return match
-
-
-@app.event("app_mention", matchers=[query_all_networks_matcher])
-def query_all_networks(event, say, context, logger):
-    logger.debug("Querying all networks")
-    client = AlchemiscaleHelper()
-    scope_status_dict = client._client.get_scope_status(visualize=False)
-    for k, v in scope_status_dict.items():
-        say(f"Status {k} has count {v}")
-
-    say("________________________________")
-    say("Checking for running networks...")
-
-    running_networks = client._client.query_networks()
-
-    if not running_networks:
-        say("No networks are running currently")
-        return
-
-    networks_status = client._client.get_networks_status(running_networks)
-    networks_actioned_tasks = client._client.get_networks_actioned_tasks(
-        running_networks
-    )
-
-    for key, network_status, actioned_tasks in zip(
-        running_networks, networks_status, networks_actioned_tasks
-    ):
-        if (
-            "running" in network_status or "waiting" in network_status
-        ) and actioned_tasks:
-            say(f"Network {key} has following status breakdown")
-            state_breakdown = ""
-            for state in _status_keys:
-                state_breakdown += f"{state}: {network_status.get(state, 0)} "
-            say(state_breakdown)
-            say("________________________________")
-    say("Done :smile:")
-
-
-def run_fec_matcher(event, logger, context):
-    # regex for any instance of help, case insensitive with optional spaces
-    msg = event.get("text", None)
-    if not msg:
-        return False
-    pattern = r"(?i)run FEC"
-    match = re.search(pattern, msg)
-    return match
-
-
-@app.event("app_mention", matchers=[run_fec_matcher])
-def run_fec(event, say, context, logger):
-    logger.info("Planning and submitting from postera")
-    say(
-        "Preparing your calculation, please wait this may take a while, ... :ghost: :ghost: :ghost:"
-    )
-    content = event.get("text")
-    # parse message for molset using regex
-    pattern = r"on series\s+.*?(\b[^\s]+\b)+"
-    match = re.search(pattern, content)
-    if match:
-        postera_molset_name = match.group(1)
-        logger.info(f"Postera molecule set name is {postera_molset_name}")
-    else:
-        say(
-            "Could not find postera molecule set name in the message, unable to proceed"
-        )
-        return
-
-    campaign = "confidential"
-
-    # check for attatched file
-    files = event.get("files")
-    if not files:
-        logger.info("No file attatched, unable to proceed")
-        say("No receptor file attatched, unable to proceed")
-        return
-    else:
-        if len(files) > 1:
-            logger.info("More than one file attatched, unable to proceed")
-            say("More than one file attatched, unable to proceed")
-            return
-        # get the first file
-        file = files[0]
-        # check if it is a pdb file
-        file_extn = file.get("title").split(".")[-1]
-        if file_extn != "pdb":
-            say("Attatched file is not a pdb file, unable to proceed")
-            return
-
-    # load ligands from postera
-    try:
-        input_ligands = PosteraFactory(molecule_set_name=postera_molset_name).pull()
-    except Exception as e:
-        say(f"Failed to pull ligands from postera with error: {e}")
-        return
-
-    say(
-        f"Input series has {len(input_ligands)} ligands, this may take a while to process. I'll let you know once its running. Please be patient :ghost: :ghost: :ghost:"
-    )
-    fixed_ligands = []
-    # add hydrogens to ligands
-    for ligand in input_ligands:
-        mol = ligand.to_oemol()
-        oechem.OEAddExplicitHydrogens(mol)
-        fixed_ligands.append(Ligand.from_oemol(mol))
-    input_ligands = fixed_ligands
-    # create dataset name
-    dataset_name = postera_molset_name.replace("-", "_") + "_" + "FALCBot"
-    project = dataset_name
-
-    # run prep workflow
-    logger.info("Running prep workflow")
-
-    prep_factory = AlchemyPrepWorkflow()
-
-    # load receptor from attatched file
-    # read into temp file
-    # TODO move to pre-prepped PDBs hosted on the cloud instance and pull from there
-    try:
-        with NamedTemporaryFile(suffix=".pdb") as temp:
-            logger.info(f"file: {file.get('url_private_download')}")
-            _download_slack_file(file.get("url_private_download"), temp.name)
-            ref_complex = Complex.from_pdb(
-                temp.name,
-                target_kwargs={"target_name": f"{dataset_name}_receptor"},
-                ligand_kwargs={"compound_name": f"{dataset_name}_receptor_ligand"},
-            )
-    except Exception as e:
-        say(f"Failed to load receptor from attatched file with error: {e}")
-        return
-    # prep the complex
-    logger.info("Prepping complex")
-    prepped_ref_complex = PreppedComplex.from_complex(ref_complex)
-
-    import time
-
-    logger.info("Creating alchemy dataset")
-    processors = cpu_count() - 1
-    logger.info(f"Using {processors} processors")
-    start_time = time.time()
-    alchemy_dataset = prep_factory.create_alchemy_dataset(
-        dataset_name=dataset_name,
-        ligands=input_ligands,
-        reference_complex=prepped_ref_complex,
-        processors=processors,
-    )
-    end_time = time.time()
-    execution_time = end_time - start_time
-    logger.info(f"Time taken to create alchemy dataset: {execution_time} seconds")
-
-    # check for failed ligands
-    logger.info("Checking for failed ligands")
-    if alchemy_dataset.failed_ligands:
-        fails = sum([len(values) for values in alchemy_dataset.failed_ligands.values()])
-        say(f"Failed to prep {fails} ligands")
-        # add more detail
-
-    # we have our working ligands
-    posed_ligands = alchemy_dataset.posed_ligands
-
-    # ok now onto actual network creation
-    logger.info("Creating factory and planned network")
-    factory = FreeEnergyCalculationFactory()
-
-    # create receptor
-    # write to a temp pdb file and read back in
-    with NamedTemporaryFile(suffix=".pdb") as fp:
-        alchemy_dataset.reference_complex.target.to_pdb_file(fp.name)
-        receptor = ProteinComponent.from_pdb_file(fp.name)
-
-    # create factory
-    logger.info("Planning network with factory and planned network")
-    planned_network = factory.create_fec_dataset(
-        dataset_name=dataset_name,
-        receptor=receptor,
-        ligands=posed_ligands,
-        central_ligand=None,
-        experimental_protocol=None,
-    )
-
-    # we want to return links to the factory and planned network
-    # we do this through artifacts in a cloudfront exposed bucket
-    cf = CloudFront.from_settings(CloudfrontSettings())
-    s3 = S3.from_settings(S3Settings())
-
-    # push factory to cloudfront exposed bucket
-    factory_fname = f"fec_factory-{dataset_name}.json"
-    factory_bucket_path = f"alchemy/{dataset_name}/{factory_fname}"
-    with NamedTemporaryFile() as temp:
-        factory.to_file(filename=temp.name)
-        factory_cf_url = _push_to_s3_with_cloudfront(
-            s3, cf, factory_bucket_path, temp.name, content_type="application/json"
-        )
-
-    planned_network_fname = f"planned_network-{dataset_name}.json"
-    planned_network_bucket_path = f"alchemy/{dataset_name}/{planned_network_fname}"
-    # push planned network to cloudfront exposed bucket
-    with NamedTemporaryFile() as temp:
-        planned_network.to_file(filename=temp.name)
-        planned_network_cf_url = _push_to_s3_with_cloudfront(
-            s3,
-            cf,
-            planned_network_bucket_path,
-            temp.name,
-            content_type="application/json",
-        )
-
-    ligands_fname = f"ligands-{dataset_name}.sdf"
-    ligands_fname_bucket_path = f"alchemy/{dataset_name}/{ligands_fname}"
-    # push planned network to cloudfront exposed bucket
-    with NamedTemporaryFile(suffix=".sdf") as temp:
-        alchemy_dataset.save_posed_ligands(temp.name)
-        ligand_cf_url = _push_to_s3_with_cloudfront(
-            s3,
-            cf,
-            ligands_fname_bucket_path,
-            temp.name,
-            content_type="text/plain",
-        )
-
-    receptor_fname = f"receptor-{dataset_name}.pdb"
-    receptor_fname_bucket_path = f"alchemy/{dataset_name}/{receptor_fname}"
-    # push planned network to cloudfront exposed bucket
-    with NamedTemporaryFile(suffix=".pdb") as temp:
-        alchemy_dataset.reference_complex.target.to_pdb_file(temp.name)
-        receptor_cf_url = _push_to_s3_with_cloudfront(
-            s3,
-            cf,
-            receptor_fname_bucket_path,
-            temp.name,
-            content_type="text/plain",
-        )
-
-    logger.info(f"Data set name: {dataset_name}")
-    logger.info(f"Factory url: {factory_cf_url}")
-    logger.info(f"Planned network url: {planned_network_cf_url}")
-    logger.info(f"Ligands url: {ligand_cf_url}")
-    logger.info(f"Receptor url: {receptor_cf_url}")
-
-    # submit the network
-    client = AlchemiscaleHelper()
-
-    network_scope = Scope(org="asap", campaign=campaign, project=project)
-    submitted_network = client.create_network(
-        planned_network=planned_network, scope=network_scope
-    )
-    task_ids = client.action_network(
-        planned_network=submitted_network, prioritize=False
-    )
-    logger.debug(
-        f"Submitted network {submitted_network.results.network_key} with task ids {task_ids} to campaign {campaign} and project {project}."
-    )
-    # except Exception as e:
-    #     say(f"Failed to submit network with error: {e}")
-    #     return
-
-    insert_series(
-        db_connection,
-        dataset_name,
-        factory_cf_url,
-        planned_network_cf_url,
-        ligand_cf_url,
-        receptor_cf_url,
-    )
-
-    say(
-        f"Simulations are running! :rocket: :rocket: :rocket: Your project name is: {project}, to debug use `@falcbot debug series {dataset_name}`"
-    )
-
-
-def debug_series_matcher(event, logger, context):
-    # regex for any instance of help, case insensitive with optional spaces
-    msg = event.get("text", None)
-    if not msg:
-        return False
-    pattern = r"(?i)debug series"
-    match = re.search(pattern, msg)
-    return match
-
-
-@app.event("app_mention", matchers=[debug_series_matcher])
-def debug_series(event, say, context, logger):
-    message = event.get("text")
-    pattern = r"series\s+.*?(\b[^\s]+\b)+"
-    match = re.search(pattern, message)
-    if match:
-        series_name = match.group(1)
-        logger.info(f"Series name is {series_name}")
-    else:
-        say("Could not find series name in the message, unable to proceed")
-        return
-
-    # query the database
-    series = query_series_by_name(db_connection, series_name)
-    if not series:
-        say(f"Series {series_name} not found in the database, unable to proceed")
-        return
-    say(f"Series {series_name} found with values: {series}")
-
-    ligand_cf_url = series[4]
-    receptor_cf_url = series[5]
-    factory_cf_url = series[2]
-    planned_network_cf_url = series[3]
-
-    # make block data from the links
-    block_data = [
-        {
-            "type": "section",
-            "text": {
-                "type": "mrkdwn",
-                "text": "Links to your debugging info :pill: :pill: :pill:",
-            },
-        },
-        _link_to_block_data(ligand_cf_url, "Ligand SDF file"),
-        _link_to_block_data(receptor_cf_url, "Receptor PDB file"),
-        _link_to_block_data(factory_cf_url, "FECFactory JSON"),
-        _link_to_block_data(planned_network_cf_url, "PlannedNetwork JSON"),
-    ]
-
-    say("Links to your debugging info:", blocks=block_data)
-
-    return
-
 
 def pred_matcher(event, logger, context):
     # regex for any instance of help, case insensitive with optional spaces
@@ -633,10 +302,26 @@ def list_targets_matcher(event, logger, context):
 
 @app.event("app_mention", matchers=[list_targets_matcher])
 def list_all_targets(say, context, logger):
     say(f"Targets: {ASAPMLModelRegistry.get_targets_with_models()}")
     return
 
 
+def list_endpoints_matcher(event, logger, context):
+    # regex for any instance of help, case insensitive with optional spaces
+    msg = event.get("text", None)
+    if not event:
+        return False
+    pattern = r"(?i)list valid endpoints"
+    match = re.search(pattern, msg)
+    return match
+
+
+@app.event("app_mention", matchers=[list_targets_matcher])
+def list_endpoints(say, context, logger):
+    say(f"Endpoints: {ASAPMLModelRegistry.get_endpoints()}")
+    return
+
+
 def help_matcher(event, logger, context):
     # regex for any instance of help, case insensitive with optional spaces
     msg = event.get("text", None)
@@ -651,6 +336,7 @@ def help_matcher(event, logger, context):
     match = re.search(pattern, msg)
     return match
 
+
 @app.event("app_mention", matchers=[help_matcher])
 def help_with_msg(say, context, event, logger):
     help(say, context, event, logger)
@@ -662,13 +348,11 @@ def help_on_mention(say, context, event, logger):
 
 def help(say, context, event, logger):
     say(
-        "you asked for help or misspelt a command, I can help you with the following commands:"
+        "you asked for help or misspelt a command, I can help you with the following commands:\n"
     )
-    say("* `@falcbot run FEC on series `")
-    say("* `@falcbot predict pIC50 for SMILES for target `")
+    say("* `@falcbot predict for compound for `")
     say("* `@falcbot predict pIC50 for structure for target `")
     say("* `@falcbot list valid targets`")
-    say("* `@falcbot query all networks`")
-    say("* `@falcbot debug series `")
+    say("* `@falcbot list valid endpoints`")
     say("* `@falcbot are you alive`")
     say("* `@falcbot help`")

From 5c7d2fe52fadb129e47868975c942e4a9993c4be Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:04:19 +1000
Subject: [PATCH 07/13] working?

---
 falcbot/falcbot.py | 106 +++++++++++----------------------------------
 falcbot/llm.py     |  49 ++++++++++++----------
 2 files changed, 45 insertions(+), 110 deletions(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 50ec1a3..01f47b3 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -171,10 +171,12 @@ def pred_matcher(event, logger, context):
     # regex for any instance of help, case insensitive with optional spaces
     msg = event.get("text", None)
     if not msg:
         return False
-    pattern = r"(?i)predict"
-    match = re.search(pattern, msg)
-    return match
+    worked, model = llm._IS_ML_QUERY_LLM.query(msg)
+    if not worked:
+        return False
+    if worked:
+        return model.value
 
 
 @app.event("app_mention", matchers=[pred_matcher])
@@ -185,7 +187,7 @@ def make_pic50_pred(event, say, context, logger):
     # parse with LLM
     worked, model = llm._BASIC_ML_LLM.query(content)
     if not worked:
-        say("Failed to parse the message, try something like `predict pIC50 for SMILES for pro for target `")
+        say("Failed to parse the message, try something like `I would like to predict pIC50 for compound CCCC for MERS`")
         return
 
     # get the SMILES, target and property as parsed by the LLM
@@ -196,104 +198,49 @@ def make_pic50_pred(event, say, context, logger):
     target = model.biological_target
     endpoint = model.property # llm found property better
 
+
     if not util._is_valid_smiles(smiles):
         say(f"Invalid SMILES {smiles}, unable to proceed")
         return
+
     if not target in ASAPMLModelRegistry.get_targets_with_models():
         say(
             f"Invalid target {target}, not in: {ASAPMLModelRegistry.get_targets_with_models()}; unable to proceed"
         )
         return
 
+
     if not endpoint in ASAPMLModelRegistry.get_endpoints():
         say(
             f"Invalid endpoint {endpoint}, not in: {ASAPMLModelRegistry.get_endpoints()}; unable to proceed"
         )
         return
 
+    _global_model = False
+
+    if not ASAPMLModelRegistry.endpoint_has_target(endpoint):
+        _target = None
+        _global_model = True
+    else:
+        _target = target
+
 
     smiles = util._rdkit_smiles_roundtrip(smiles)
-    model = ASAPMLModelRegistry.get_latest_model_for_target_type_and_endpoint(target, "GAT", endpoint)
+    model = ASAPMLModelRegistry.get_latest_model_for_target_type_and_endpoint(_target, "GAT", endpoint)
+    if model is None:
+        say(f"No model found for {target} {endpoint}")
+        return
     infr = GATInference.from_ml_model_spec(model)
     pred = infr.predict_from_smiles(smiles)
     say(
-        f"Predicted {target} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:"
+        f"Predicted {target} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
     )
 
     # TODO make pred for every target if none specified
 
+    return
+
 
-def make_structural_pred_matcher(event, logger, context):
-    # regex for any instance of help, case insensitive with optional spaces
-    msg = event.get("text", None)
-    if not msg:
-        return False
-    pattern = r"(?i)predict pIC50 for structure"
-    match = re.search(pattern, msg)
-    return match
-
-
-@app.event("app_mention", matchers=[make_structural_pred_matcher])
-def make_structural_pred(event, say, context, logger):
-    content = event.get("text")
-    # parse message for molset using regex
-    pattern = r"(?i)\s+for\s+target\s+(.+)"
-    match = re.search(pattern, content)
-    if match:
-        target = match.group(1)
-    else:
-        say("Could not find Target in the message, unable to proceed")
-        return
-
-    allowed_targets = list(
-        set(ASAPMLModelRegistry.get_targets_with_models()) - {"SARS-CoV-2-Mac1"}
-    )  # remove SARS-CoV-2-Mac1, currently not supported
-
-    if not target in allowed_targets:
-        say(f"Invalid target {target}, not in: {allowed_targets}; unable to proceed")
-        return
-
-    # check for attatched file
-    files = event.get("files")
-    if not files:
-        logger.info("No file attatched, unable to proceed")
-        say("No pdb file attatched, unable to proceed")
-        return
-    else:
-        if len(files) > 1:
-            logger.info("More than one file attatched, unable to proceed")
-            say("More than one file attatched, unable to proceed")
-            return
-        # get the first file
-        file = files[0]
-        title = file.get("title")
-        # check if it is a pdb file
-        file_extn = file.get("title").split(".")[-1]
-        if file_extn != "pdb":
-            say("Attatched file is not a pdb file, unable to proceed")
-            return
-
-    try:
-        with NamedTemporaryFile(suffix=".pdb") as temp:
-            logger.info(f"file: {file.get('url_private_download')}")
-            _download_slack_file(file.get("url_private_download"), temp.name)
-            ref_complex = Complex.from_pdb(
-                temp.name,
-                target_kwargs={"target_name": f"receptor"},
-                ligand_kwargs={"compound_name": f"receptor_ligand"},
-            )
-    except Exception as e:
-        say(f"Failed to load receptor from attatched file with error: {e}")
-        return
-
-    # make prediction
-    si = SchnetInference.from_latest_by_target(target)
-    pred = si.predict_from_oemol(ref_complex.to_combined_oemol())
-    say(
-        f"Predicted pIC50 for {title} is {pred:.2f} using model {si.model_name} :test_tube:"
-    )
-
-    # TODO make pred for every target if none specified
 
 
 def list_targets_matcher(event, logger, context):
@@ -246,7 +178,10 @@ def list_targets_matcher(event, logger, context):
 
 @app.event("app_mention", matchers=[list_targets_matcher])
 def list_all_targets(say, context, logger):
-    say(f"Targets: {ASAPMLModelRegistry.get_targets_with_models()}")
+    targets = ASAPMLModelRegistry.get_targets_with_models()
+    # filter out None values
+    targets = [t for t in targets if t is not None]
+    say(f"Targets: {targets}")
     return
@@ -264,7 +199,7 @@ def list_endpoints_matcher(event, logger, context):
     return match
 
 
-@app.event("app_mention", matchers=[list_targets_matcher])
+@app.event("app_mention", matchers=[list_endpoints_matcher])
 def list_endpoints(say, context, logger):
     say(f"Endpoints: {ASAPMLModelRegistry.get_endpoints()}")
     return
@@ -298,7 +233,6 @@ def help(say, context, event, logger):
         "you asked for help or misspelt a command, I can help you with the following commands:\n"
     )
     say("* `@falcbot predict for compound for `")
-    say("* `@falcbot predict pIC50 for structure for target `")
     say("* `@falcbot list valid targets`")
     say("* `@falcbot list valid endpoints`")
     say("* `@falcbot are you alive`")

diff --git a/falcbot/llm.py b/falcbot/llm.py
index 0bd14a3..38657da 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -19,29 +19,12 @@ class ASAPMLModelQuery(BaseModel):
     property: str = Field(..., description="Measured property for the compound")
 
 
-    # VALIDATE IN slack function to give feedback?
-
-    # @validator("SMILES")
-    # @classmethod
-    # def validate_smiles(cls, v):
-    #     if not _is_valid_smiles(v):
-    #         raise ValueError("Invalid SMILES string")
-    #     return v
-
-    # @validator("biological_target")
-    # @classmethod
-    # def validate_target(cls, v):
-    #     if v not in TargetTags.get_values():
-    #         raise ValueError("Invalid target")
-    #     return v
-
-    # @validator("property")
-    # @classmethod
-    # def validate_property(cls, v):
-    #     if v not in ASAPMLModelRegistry.get_endpoints():
-    #         raise ValueError("Invalid property")
-    #     return v
-
+class IsMLQuery(BaseModel):
+    """
+    Model that checks if a query is a machine learning query
+    """
+    value: bool = Field(..., description="Boolean value indicating if the query is a machine learning query")
+
 
 def _and_join(lst):
     return " and ".join(lst)
@@ -64,6 +47,10 @@ def _make_ml_prompt_template() -> PromptTemplate:
 _ML_PROMPT_TEMPLATE = _make_ml_prompt_template()
 
 
+_base_is_query_prompt_template = "You are an expert scientist, parse the following and determine if it is a request for a prediction from a machine learning model, look for words like predict, : {query}"
+_IS_ML_QUERY_PROMPT_TEMPLATE = PromptTemplate(_base_is_query_prompt_template)
+
+
 
 class StructuredLLMQuery:
@@ -90,12 +77,14 @@ def __init__(self, pydantic_model: BaseModel, prompt_template: PromptTemplate,
 
 
     def query(self, query: str):
-        # try:
-        parsed_model = self.program(query=query)
-        return True, parsed_model
+        try:
+            parsed_model = self.program(query=query)
+            return True, parsed_model
 
-        # except Exception as e:
-        #     print(e)
-        #     return False, None
+        except Exception as e:
+            print(e)
+            return False, None
+
+_BASIC_ML_LLM = StructuredLLMQuery(ASAPMLModelQuery, _ML_PROMPT_TEMPLATE)
 
-_BASIC_ML_LLM = StructuredLLMQuery(ASAPMLModelQuery, _ML_PROMPT_TEMPLATE)
\ No newline at end of file
+_IS_ML_QUERY_LLM = StructuredLLMQuery(IsMLQuery, _IS_ML_QUERY_PROMPT_TEMPLATE)
\ No newline at end of file

From 2b122af0f91bc6b271d9bf879ebce0e5e2225013 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:07:06 +1000
Subject: [PATCH 08/13] semi working

---
 devtools/conda-envs/falcbot.yaml | 10 ++---
 falcbot/falcbot.py               | 96 --------------------------------
 2 files changed, 5 insertions(+), 101 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index 8750a97..d7208b7 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -63,10 +63,10 @@ dependencies:
 
   # Pip-only installs
   - pip:
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
-    - git+https://github.com/choderalab/asapdiscovery@ml_reg_tests_+_bigfix#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-alchemy&subdirectory=asapdiscovery-alchemy
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-data&subdirectory=asapdiscovery-data
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-modeling&subdirectory=asapdiscovery-modeling
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-docking&subdirectory=asapdiscovery-docking
+    - git+https://github.com/choderalab/asapdiscovery@main#egg=asapdiscovery-ml&subdirectory=asapdiscovery-ml

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 01f47b3..9f8bade 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -51,102 +51,6 @@ class SlackSettings(BaseSettings):
     )
 
 
-def connect_sqlite_db(path, check_same_thread=False):
-    connection = None
-    try:
-        connection = sqlite3.connect(path, check_same_thread=check_same_thread)
-        print("Connection to SQLite DB successful")
-    except sqlite3.Error as e:
-        print(f"The error '{e}' occurred")
-
-    return connection
-
-
-def execute_query(connection, query):
-    cursor = connection.cursor()
-    try:
-        cursor.execute(query)
-        connection.commit()
-        print("Query executed successfully")
-    except sqlite3.Error as e:
-        print(f"The error '{e}' occurred")
-
-
-def create_series_table(connection):
-    create_series_table_query = """
-    CREATE TABLE IF NOT EXISTS series (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
-        name TEXT NOT NULL UNIQUE,
-        factory_url TEXT NOT NULL,
-        planned_network_url TEXT NOT NULL,
-        ligands_url TEXT NOT NULL,
-        receptor_url TEXT NOT NULL
-
-    );
-    """
-    execute_query(connection, create_series_table_query)
-
-
-def insert_series(
-    connection, name, factory_url, planned_network_url, ligands_url, receptor_url
-):
-    insert_series_query = f"""
-    INSERT INTO series (name, factory_url, planned_network_url, ligands_url, receptor_url)
-    VALUES ('{name}', '{factory_url}', '{planned_network_url}', '{ligands_url}', '{receptor_url}');
-    """
-    execute_query(connection, insert_series_query)
-
-
-def query_series_by_name(connection, name):
-    query = f"SELECT * FROM series WHERE name='{name}'"
-    cursor = connection.cursor()
-    cursor.execute(query)
-    # unpack into a dictionary
-    series = cursor.fetchone()
-    print(series)
-    return series
-
-
 settings = SlackSettings()
 app = App(token=settings.SLACK_BOT_TOKEN)
-db_connection = connect_sqlite_db("falcbot.sqlite3", check_same_thread=False)
-create_series_table(db_connection)
-
-
-_status_keys = ["complete", "running", "waiting", "error", "invalid", "deleted"]
-
-
-def _download_slack_file(file_url, file_name):
-    import requests
-
-    headers = {"Authorization": f"Bearer {settings.SLACK_BOT_TOKEN}"}
-
-    response = requests.get(file_url, headers=headers, stream=True)
-    response.raise_for_status()
-    with open(file_name, "wb") as f:
-        for chunk in response.iter_content(chunk_size=2048):
-            f.write(chunk)
-
-
-def _push_to_s3_with_cloudfront(
-    s3_instance: S3,
-    cloudfront_instance: CloudFront,
-    bucket_path: str,
-    file_path: str,
-    expires_delta: timedelta = timedelta(days=365 * 5),
-    content_type: str = "application/json",
-) -> str:
-    # push to s3
-    s3_instance.push_file(file_path, location=bucket_path, content_type=content_type)
-    # generate cloudfront url
-    expiry = datetime.utcnow() + expires_delta
-    return cloudfront_instance.generate_signed_url(bucket_path, expiry)
-
-
-def _link_to_block_data(link, text):
-    return {
-        "type": "section",
-        "text": {"type": "mrkdwn", "text": f"<{link}|{text}>"},
-    }
-
 

From 851f187317c6eb9038416957379ca2abfb782933 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:20:35 +1000
Subject: [PATCH 09/13] simplify str

---
 falcbot/falcbot.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 9f8bade..c3e6b3d 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -120,8 +120,10 @@ def make_pic50_pred(event, say, context, logger):
     if not ASAPMLModelRegistry.endpoint_has_target(endpoint):
         _target = None
         _global_model = True
+        _target_str = "global"
     else:
         _target = target
+        _target_str = target
 
 
@@ -133,7 +135,7 @@ def make_pic50_pred(event, say, context, logger):
     infr = GATInference.from_ml_model_spec(model)
     pred = infr.predict_from_smiles(smiles)
     say(
-        f"Predicted {target} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
+        f"Predicted {_target_str} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
     )
 
     # TODO make pred for every target if none specified

From 97761c8384cbc31e057c118064e1e35f293e9a41 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:23:12 +1000
Subject: [PATCH 10/13] cleanup

---
 falcbot/falcbot.py | 29 +----------------------------
 falcbot/llm.py     |  2 --
 2 files changed, 1 insertion(+), 30 deletions(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index c3e6b3d..3b2a792 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -1,41 +1,14 @@
 import logging
 import re
-import uuid
 import logging
-from datetime import datetime, timedelta
-from tempfile import NamedTemporaryFile
 from pydantic import BaseSettings, Field
 from slack_bolt import App
 from slack_bolt.adapter.socket_mode import SocketModeHandler
-
-from alchemiscale import Scope
-from openfe import ProteinComponent
-from asapdiscovery.alchemy.schema.fec import (
-    FreeEnergyCalculationFactory,
-    AlchemiscaleSettings,
-)
-from asapdiscovery.alchemy.schema.prep_workflow import AlchemyPrepWorkflow
-from asapdiscovery.alchemy.utils import AlchemiscaleHelper
-
-from asapdiscovery.data.schema.complex import Complex, PreppedComplex
-from asapdiscovery.data.schema.ligand import Ligand
-from asapdiscovery.data.backend.openeye import oechem
-from asapdiscovery.data.services.postera.postera_factory import PosteraFactory
-from asapdiscovery.data.services.services_config import CloudfrontSettings, S3Settings
-from asapdiscovery.data.services.aws.cloudfront import CloudFront
-from asapdiscovery.data.services.aws.s3 import S3
-
-from asapdiscovery.ml.inference import GATInference, SchnetInference
-from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
+from asapdiscovery.ml.inference import GATInference
 from asapdiscovery.ml.models import ASAPMLModelRegistry
 import llm
 import util
 
-# from falcbot.sqlite_db import connect_sqlite_db, insert_series, create_series_table
-
-from rdkit import Chem
-import sqlite3
-from multiprocessing import cpu_count
 
 # logger in a global context
 logging.basicConfig(level=logging.DEBUG)

diff --git a/falcbot/llm.py b/falcbot/llm.py
index 38657da..f7e768b 100644
--- a/falcbot/llm.py
+++ b/falcbot/llm.py
@@ -1,8 +1,6 @@
 import os
 from asapdiscovery.ml.models import ASAPMLModelRegistry
-from asapdiscovery.data.services.postera.manifold_data_validation import TargetTags
 
-import util
 from llama_index.core.program import LLMTextCompletionProgram
 from llama_index.core import PromptTemplate
 from llama_index.llms.openai import OpenAI

From 88d947f9a3bfeefa32f2840d367e6b737b8d2434 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:27:21 +1000
Subject: [PATCH 11/13] remove alchemy deps

---
 devtools/conda-envs/falcbot.yaml | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index d7208b7..7b90489 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -41,17 +41,6 @@ dependencies:
   - wandb
   - semver
 
-  # alchemy
-  - numpy
-  - gufe =>0.9.5
-  - httpx
-  - perses >=0.10.2
-  - kartograf
-  - rich
-  - alchemiscale-client
-  - cinnabar >=0.4.1
-  - openeye-toolkits
-  - openfe
 
   # other asapdiscovery deps
   - distributed

From 3763d73e458c3a0bc67f2792e185205a9fc754d8 Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:31:45 +1000
Subject: [PATCH 12/13] re-add openeye

---
 devtools/conda-envs/falcbot.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/devtools/conda-envs/falcbot.yaml b/devtools/conda-envs/falcbot.yaml
index 7b90489..959304d 100644
--- a/devtools/conda-envs/falcbot.yaml
+++ b/devtools/conda-envs/falcbot.yaml
@@ -39,8 +39,7 @@ dependencies:
   - mtenn >=0.5.1
   - wandb
   - semver
-
-
+  - openeye-toolkits
 
   # other asapdiscovery deps
   - distributed

From c5ddc634dc974399650359dc5fd405a61d22875c Mon Sep 17 00:00:00 2001
From: hmacdope
Date: Thu, 19 Sep 2024 19:46:45 +1000
Subject: [PATCH 13/13] add in errors

---
 falcbot/falcbot.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/falcbot/falcbot.py b/falcbot/falcbot.py
index 3b2a792..cda4942 100644
--- a/falcbot/falcbot.py
+++ b/falcbot/falcbot.py
@@ -2,6 +2,7 @@ import logging
 import re
 import logging
 from pydantic import BaseSettings, Field
+import numpy as np
 from slack_bolt import App
 from slack_bolt.adapter.socket_mode import SocketModeHandler
 from asapdiscovery.ml.inference import GATInference
@@ -108,7 +109,11 @@ def make_pic50_pred(event, say, context, logger):
         say(f"No model found for {target} {endpoint}")
         return
     infr = GATInference.from_ml_model_spec(model)
-    pred = infr.predict_from_smiles(smiles)
+    pred, err = infr.predict_from_smiles(smiles, return_err=True)
+    if np.isnan(err):
+        errstr = " "
+    else:
+        errstr = f" ± {err:.2f} "
     say(
-        f"Predicted {_target_str} {endpoint} for {smiles} is {pred:.2f} using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
+        f"Predicted {_target_str} {endpoint} for {smiles} is {pred:.2f}{errstr}using model {infr.model_name} :test_tube:" + (" (global model)" if _global_model else "")
     )
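
A quick way to exercise the LLM plumbing from these patches outside Slack is sketched below. This is a minimal sketch and not part of the patch series: it assumes the falcbot conda environment is installed, the falcbot/ directory is on PYTHONPATH, and OPENAI_API_KEY is set; the file name is hypothetical and the query string reuses the example from PATCH 07.

    # smoke_test_llm.py -- illustrative sketch only, not part of the patches
    import llm

    query = "predict pIC50 for compound CCCC for MERS"

    # _IS_ML_QUERY_LLM (PATCH 07) gates whether a message is a prediction request at all
    worked, is_ml = llm._IS_ML_QUERY_LLM.query(query)
    print("is ML query:", worked and is_ml.value)

    # _BASIC_ML_LLM (PATCH 02) extracts the SMILES, target and endpoint
    # as an ASAPMLModelQuery pydantic model
    worked, parsed = llm._BASIC_ML_LLM.query(query)
    if worked:
        print(parsed.SMILES, parsed.biological_target, parsed.property)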