Merge pull request #18 from parkervg/joiner-heuristic-integration

Joiner heuristic integration
parkervg · Jun 1, 2024 · edfc122 · edfc122
2 parents fead7c2 + 86c0fca
commit edfc122
Show file tree

Hide file tree

Showing 5 changed files with 181 additions and 29 deletions.
diff --git a/blendsql/blend.py b/blendsql/blend.py
@@ -81,6 +81,7 @@ def get_from_name(self, name: str):
         )
 
     def extend(self, ingredients: Iterable[Ingredient]) -> None:
+        """ "Initializes ingredients class with base attributes, for use in later operations."""
         try:
             if not all(issubclass(x, Ingredient) for x in ingredients):
                 raise IngredientException(
@@ -95,12 +96,14 @@ def extend(self, ingredients: Iterable[Ingredient]) -> None:
             assert (
                 name not in self.added_ingredient_names
             ), f"Duplicate ingredient names passed! These are case insensitive, be careful.\n{name}"
-            ingredient = ingredient(name=name)
+            ingredient = ingredient(
+                name=name,
+                # Add db and session_uuid as default kwargs
+                # This way, ingredients are able to interact with data
+                db=self.db,
+                session_uuid=self.session_uuid,
+            )
             self.added_ingredient_names.add(name)
-            # Add db and session_uuid as default kwargs
-            # This way, ingredients are able to interact with data
-            ingredient.db = self.db
-            ingredient.session_uuid = self.session_uuid
             self.append(ingredient)
 
 

diff --git a/blendsql/ingredients/ingredient.py b/blendsql/ingredients/ingredient.py
@@ -1,6 +1,8 @@
 from attr import attrs, attrib
-from abc import abstractmethod, ABC
+from abc import abstractmethod
 import pandas as pd
+import json
+from skrub import Joiner
 from typing import (
     Any,
     Iterable,
@@ -14,9 +16,12 @@
     Optional,
 )
 import uuid
+from colorama import Fore
 from typeguard import check_type
+from functools import partialmethod
 
 from .._exceptions import IngredientException
+from .._logger import logger
 from .. import utils
 from .._constants import IngredientKwarg, IngredientType
 from ..db import Database
@@ -30,14 +35,25 @@ def unpack_default_kwargs(**kwargs):
     )
 
 
+def partialclass(cls, *args, **kwds):
+    # https://stackoverflow.com/a/38911383
+    class NewCls(cls):
+        __init__ = partialmethod(cls.__init__, *args, **kwds)
+
+    NewCls.__name__ = cls.__name__
+    return NewCls
+
+
 @attrs
-class Ingredient(ABC):
+class Ingredient:
     name: str = attrib()
+    # Below gets passed via `Kitchen.extend()`
+    db: Database = attrib()
+    session_uuid: str = attrib()
+
     ingredient_type: str = attrib(init=False)
     allowed_output_types: Tuple[Type] = attrib(init=False)
-    # Below gets passed via `Kitchen.extend()`
-    db: Database = attrib(init=False)
-    session_uuid: str = attrib(init=False)
+    num_values_passed: int = 0
 
     def __repr__(self):
         return f"{self.ingredient_type} {self.name}"
@@ -70,7 +86,6 @@ class MapIngredient(Ingredient):
     to each of the given values, creating a new column."""
 
     ingredient_type: str = IngredientType.MAP.value
-    num_values_passed: int = 0
     allowed_output_types: Tuple[Type] = (Iterable[Any],)
 
     def unpack_default_kwargs(self, **kwargs):
@@ -174,10 +189,15 @@ class JoinIngredient(Ingredient):
         {"tomato": "red", "broccoli": "green", "lemon": "yellow"}
     """
 
+    use_skrub_joiner: bool = attrib(default=True)
+
     ingredient_type: str = IngredientType.JOIN.value
-    num_values_passed: int = 0
     allowed_output_types: Tuple[Type] = (dict,)
 
+    @classmethod
+    def from_args(cls, use_skrub_joiner: bool = True):
+        return partialclass(cls, use_skrub_joiner=use_skrub_joiner)
+
     def __call__(
         self,
         question: Optional[str] = None,
@@ -190,7 +210,6 @@ def __call__(
         aliases_to_tablenames: Dict[str, str] = kwargs.get("aliases_to_tablenames")
         get_temp_subquery_table: Callable = kwargs.get("get_temp_subquery_table")
         get_temp_session_table: Callable = kwargs.get("get_temp_session_table")
-
         # Depending on the size of the underlying data, it may be optimal to swap
         #   the order of 'left_on' and 'right_on' columns during processing
         swapped = False
@@ -212,12 +231,15 @@ def __call__(
                 )
             )
             modified_lr_identifiers.append((tablename, colname))
-
+        sorted_values = sorted(values, key=len)
+        # check swapping only once, at the beginning
+        if sorted_values != values:
+            swapped = True
         if question is None:
             # First, check which values we actually need to call Model on
             # We don't want to join when there's already an intuitive alignment
             # First, make sure outer loop is shorter of the two lists
-            outer, inner = sorted(values, key=len)
+            outer, inner = sorted_values
             _outer = []
             inner = set(inner)
             mapping = {}
@@ -230,16 +252,41 @@ def __call__(
                     _outer.append(l)
                 if len(inner) == 0:
                     break
-            to_compare = [inner, _outer]
-        else:
-            to_compare = values
+            # Remained _outer and inner lists preserved the sorting order in length:
+            # len(_outer) = len(outer) - #matched <= len(inner original) - matched = len(inner)
+            if self.use_skrub_joiner and len(inner) > 1:
+                # Create the main_table DataFrame
+                main_table = pd.DataFrame(_outer, columns=["out"])
+                # Create the aux_table DataFrame
+                aux_table = pd.DataFrame(inner, columns=["in"])
+                joiner = Joiner(
+                    aux_table,
+                    main_key="out",
+                    aux_key="in",
+                    max_dist=0.9,
+                    add_match_info=False,
+                )
+                res = joiner.fit_transform(main_table)
+                # Below is essentially set.difference on aux_table and those paired in res
+                inner = aux_table.loc[~aux_table["in"].isin(res["in"]), "in"].tolist()
+                # length(new inner) = length(inner) - #matched by fuzzy join
+                _outer = res["out"][res["in"].isnull()].to_list()
+                # length(new _outer) = length(_outer) - #matched by fuzzy join
+                _mapping = res.dropna(subset=["in"]).set_index("out")["in"].to_dict()
+                logger.debug(
+                    Fore.YELLOW
+                    + "Made the following alignment with `skrub.Joiner`:"
+                    + Fore.RESET
+                )
+                logger.debug(Fore.YELLOW + json.dumps(_mapping, indent=4) + Fore.RESET)
+                mapping = mapping | _mapping
+            # order by length is still preserved regardless of using fuzzy join, so after initial matching and possible fuzzy join matching
+            # This is because the lengths of each list will decrease at the same rate, so whichever list was larger at the beginning,
+            # will be larger here at the end.
+            # len(_outer) <= len(inner)
+            sorted_values = [_outer, inner]
 
-        # Finally, order by new (remaining) length and check if we swapped places from original
-        sorted_values = sorted(to_compare, key=len)
-        if sorted_values != values:
-            swapped = True
         left_values, right_values = sorted_values
-
         kwargs["left_values"] = left_values
         kwargs["right_values"] = right_values
 
@@ -261,7 +308,6 @@ def __call__(
             kwargs[IngredientKwarg.QUESTION] = question
             _mapping: Dict[str, str] = self._run(*args, **kwargs)
             mapping = mapping | _mapping
-
         # Using mapped left/right values, create intermediary mapping table
         temp_join_tablename = get_temp_session_table(str(uuid.uuid4())[:4])
         # Below, we check to see if 'swapped' is True
@@ -290,7 +336,6 @@ def run(self, *args, **kwargs) -> dict:
 @attrs
 class QAIngredient(Ingredient):
     ingredient_type: str = IngredientType.QA.value
-    num_values_passed: int = 0
     allowed_output_types: Tuple[Type] = (Union[str, int, float],)
 
     def __call__(

diff --git a/blendsql/models/remote/_openai.py b/blendsql/models/remote/_openai.py
@@ -1,12 +1,14 @@
 import os
+import importlib.util
 from outlines.models import openai, azure_openai, LogitsGenerator
 from outlines.models.openai import OpenAIConfig
-import tiktoken
 
 from .._model import RemoteModel
 
 DEFAULT_CONFIG = OpenAIConfig(temperature=0.0)
 
+_has_openai = importlib.util.find_spec("openai") is not None
+
 
 def openai_setup() -> None:
     """Setup helper for AzureOpenAI and OpenAI models."""
@@ -83,6 +85,13 @@ def __init__(
         caching: bool = True,
         **kwargs
     ):
+        if not _has_openai:
+            raise ImportError(
+                "Please install openai>=1.0.0 with `pip install openai>=1.0.0`!"
+            ) from None
+
+        import tiktoken
+
         super().__init__(
             model_name_or_path=model_name_or_path,
             tokenizer=tiktoken.encoding_for_model(model_name_or_path),
@@ -99,7 +108,6 @@ def _load_model(self, config: OpenAIConfig, **kwargs) -> LogitsGenerator:
             self.model_name_or_path,
             config=config,
             azure_endpoint=os.getenv("OPENAI_API_BASE"),
-            api_version=os.getenv("OPENAI_API_VERSION"),
             api_key=os.getenv("OPENAI_API_KEY"),
             **kwargs
         )
@@ -145,6 +153,13 @@ def __init__(
         caching: bool = True,
         **kwargs
     ):
+        if not _has_openai:
+            raise ImportError(
+                "Please install openai>=1.0.0 with `pip install openai>=1.0.0`!"
+            ) from None
+
+        import tiktoken
+
         super().__init__(
             model_name_or_path=model_name_or_path,
             tokenizer=tiktoken.encoding_for_model(model_name_or_path),

diff --git a/run_debug.py b/run_debug.py
@@ -0,0 +1,83 @@
+from blendsql import blend, LLMJoin, LLMMap, LLMQA
+from blendsql.db import SQLite
+from blendsql.models import TransformersLLM
+from blendsql.utils import fetch_from_hub
+from tqdm import tqdm
+
+# TEST_QUERIES = [
+#     """
+#     SELECT DISTINCT venue FROM w
+#       WHERE city = 'sydney' AND {{
+#           LLMMap(
+#               'More than 30 total points?',
+#               'w::score'
+#           )
+#       }} = TRUE
+#     """,
+#     """
+#     SELECT * FROM w
+#       WHERE city = {{
+#           LLMQA(
+#               'Which city is located 120 miles west of Sydney?',
+#               (SELECT * FROM documents WHERE documents MATCH 'sydney OR 120'),
+#               options='w::city'
+#           )
+#       }}
+#     """,
+#     """
+#     SELECT date, rival, score, documents.content AS "Team Description" FROM w
+#     JOIN {{
+#         LLMJoin(
+#             left_on='documents::title',
+#             right_on='w::rival'
+#         )
+#     }}
+#     """
+# ]
+
+TEST_QUERIES = [
+    """
+    SELECT title, player FROM w JOIN {{
+        LLMJoin(
+            left_on='documents::title',
+            right_on='w::player'
+        )
+    }}
+    """
+]
+if __name__ == "__main__":
+    """
+    Without cached LLM response (10 runs):
+        before: 3.16
+        after: 1.91
+    With cached LLM response (100 runs):
+        before: 0.0175
+        after: 0.0166
+    "Qwen -1.5-0.5B"
+    With cached LLM response (30 runs):
+        with fuzzy join: 0.431
+        without fuzzy join: 0.073
+    Without cached LLM response (30 runs):
+        with fuzzy join: 0.286
+        without fuzzy join: 318.85
+    """
+    db = SQLite(fetch_from_hub("1966_NBA_Expansion_Draft_0.db"))
+    #db = SQLite(fetch_from_hub("multi_table.db"))
+    TEST_TRANSFORMERS_LLM = "hf-internal-testing/tiny-random-PhiForCausalLM"
+    model = TransformersLLM(TEST_TRANSFORMERS_LLM, caching=False)
+
+    times = []
+    for i in range(30):
+        for q in TEST_QUERIES:
+
+            # Make our smoothie - the executed BlendSQL script
+            smoothie = blend(
+                query=q,
+                db=db,
+                blender=model,
+                verbose=False,
+                ingredients={LLMJoin, LLMMap, LLMQA},
+            )
+            times.append(smoothie.meta.process_time_seconds)
+    print(smoothie.df)
+    print(f"Average time across {len(times)} runs: {sum(times) / len(times)}")
diff --git a/setup.py b/setup.py
@@ -28,7 +28,7 @@ def find_version(*file_paths):
     url="https://github.com/parkervg/blendsql",
     author="Parker Glenn",
     author_email="[email protected]",
-    description="Query language to blend SQL logic and LLM reasoning across multi-modal data.",
+    description="Query language for blending SQL logic and LLM reasoning across multi-modal data.",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
     license="Apache License 2.0",
@@ -46,6 +46,12 @@ def find_version(*file_paths):
         "python-dotenv==1.0.1",
         "sqlglot==18.13.0",
         "sqlalchemy>=2.0.0",
+        # skrub doesn't currently support python<3.10: https://github.com/skrub-data/skrub/issues/815
+        "skrub==0.1.0 ; python_version>='3.10'",
+        # We fetch this branch which removes python 3.10 style type annotations instead, then
+        "skrub @ git+https://github.com/jeromedockes/skrub.git@strip-type-annotations ; python_version<'3.10'",
+        # https://github.com/skrub-data/skrub/issues/910
+        "scikit-learn==1.4.2",
         "huggingface_hub",
         "datasets",
         "lark",
@@ -74,7 +80,7 @@ def find_version(*file_paths):
             "recognizers-text-suite",
             "emoji==1.7.0",
         ],
-        "test": ["pytest", "huggingface_hub", "pre-commit"],
+        "test": ["pytest", "pre-commit", "llama-cpp-python", "transformers", "torch"],
         "docs": [
             "mkdocs-material",
             "mkdocstrings",