Skip to content

Commit

Permalink
Merge pull request #18 from parkervg/joiner-heuristic-integration
Browse files Browse the repository at this point in the history
Joiner heuristic integration
  • Loading branch information
parkervg authored Jun 1, 2024
2 parents fead7c2 + 86c0fca commit edfc122
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 29 deletions.
13 changes: 8 additions & 5 deletions blendsql/blend.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def get_from_name(self, name: str):
)

def extend(self, ingredients: Iterable[Ingredient]) -> None:
""" "Initializes ingredients class with base attributes, for use in later operations."""
try:
if not all(issubclass(x, Ingredient) for x in ingredients):
raise IngredientException(
Expand All @@ -95,12 +96,14 @@ def extend(self, ingredients: Iterable[Ingredient]) -> None:
assert (
name not in self.added_ingredient_names
), f"Duplicate ingredient names passed! These are case insensitive, be careful.\n{name}"
ingredient = ingredient(name=name)
ingredient = ingredient(
name=name,
# Add db and session_uuid as default kwargs
# This way, ingredients are able to interact with data
db=self.db,
session_uuid=self.session_uuid,
)
self.added_ingredient_names.add(name)
# Add db and session_uuid as default kwargs
# This way, ingredients are able to interact with data
ingredient.db = self.db
ingredient.session_uuid = self.session_uuid
self.append(ingredient)


Expand Down
85 changes: 65 additions & 20 deletions blendsql/ingredients/ingredient.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from attr import attrs, attrib
from abc import abstractmethod, ABC
from abc import abstractmethod
import pandas as pd
import json
from skrub import Joiner
from typing import (
Any,
Iterable,
Expand All @@ -14,9 +16,12 @@
Optional,
)
import uuid
from colorama import Fore
from typeguard import check_type
from functools import partialmethod

from .._exceptions import IngredientException
from .._logger import logger
from .. import utils
from .._constants import IngredientKwarg, IngredientType
from ..db import Database
Expand All @@ -30,14 +35,25 @@ def unpack_default_kwargs(**kwargs):
)


def partialclass(cls, *args, **kwds):
# https://stackoverflow.com/a/38911383
class NewCls(cls):
__init__ = partialmethod(cls.__init__, *args, **kwds)

NewCls.__name__ = cls.__name__
return NewCls


@attrs
class Ingredient(ABC):
class Ingredient:
name: str = attrib()
# Below gets passed via `Kitchen.extend()`
db: Database = attrib()
session_uuid: str = attrib()

ingredient_type: str = attrib(init=False)
allowed_output_types: Tuple[Type] = attrib(init=False)
# Below gets passed via `Kitchen.extend()`
db: Database = attrib(init=False)
session_uuid: str = attrib(init=False)
num_values_passed: int = 0

def __repr__(self):
return f"{self.ingredient_type} {self.name}"
Expand Down Expand Up @@ -70,7 +86,6 @@ class MapIngredient(Ingredient):
to each of the given values, creating a new column."""

ingredient_type: str = IngredientType.MAP.value
num_values_passed: int = 0
allowed_output_types: Tuple[Type] = (Iterable[Any],)

def unpack_default_kwargs(self, **kwargs):
Expand Down Expand Up @@ -174,10 +189,15 @@ class JoinIngredient(Ingredient):
{"tomato": "red", "broccoli": "green", "lemon": "yellow"}
"""

use_skrub_joiner: bool = attrib(default=True)

ingredient_type: str = IngredientType.JOIN.value
num_values_passed: int = 0
allowed_output_types: Tuple[Type] = (dict,)

@classmethod
def from_args(cls, use_skrub_joiner: bool = True):
return partialclass(cls, use_skrub_joiner=use_skrub_joiner)

def __call__(
self,
question: Optional[str] = None,
Expand All @@ -190,7 +210,6 @@ def __call__(
aliases_to_tablenames: Dict[str, str] = kwargs.get("aliases_to_tablenames")
get_temp_subquery_table: Callable = kwargs.get("get_temp_subquery_table")
get_temp_session_table: Callable = kwargs.get("get_temp_session_table")

# Depending on the size of the underlying data, it may be optimal to swap
# the order of 'left_on' and 'right_on' columns during processing
swapped = False
Expand All @@ -212,12 +231,15 @@ def __call__(
)
)
modified_lr_identifiers.append((tablename, colname))

sorted_values = sorted(values, key=len)
# check swapping only once, at the beginning
if sorted_values != values:
swapped = True
if question is None:
# First, check which values we actually need to call Model on
# We don't want to join when there's already an intuitive alignment
# First, make sure outer loop is shorter of the two lists
outer, inner = sorted(values, key=len)
outer, inner = sorted_values
_outer = []
inner = set(inner)
mapping = {}
Expand All @@ -230,16 +252,41 @@ def __call__(
_outer.append(l)
if len(inner) == 0:
break
to_compare = [inner, _outer]
else:
to_compare = values
# Remained _outer and inner lists preserved the sorting order in length:
# len(_outer) = len(outer) - #matched <= len(inner original) - matched = len(inner)
if self.use_skrub_joiner and len(inner) > 1:
# Create the main_table DataFrame
main_table = pd.DataFrame(_outer, columns=["out"])
# Create the aux_table DataFrame
aux_table = pd.DataFrame(inner, columns=["in"])
joiner = Joiner(
aux_table,
main_key="out",
aux_key="in",
max_dist=0.9,
add_match_info=False,
)
res = joiner.fit_transform(main_table)
# Below is essentially set.difference on aux_table and those paired in res
inner = aux_table.loc[~aux_table["in"].isin(res["in"]), "in"].tolist()
# length(new inner) = length(inner) - #matched by fuzzy join
_outer = res["out"][res["in"].isnull()].to_list()
# length(new _outer) = length(_outer) - #matched by fuzzy join
_mapping = res.dropna(subset=["in"]).set_index("out")["in"].to_dict()
logger.debug(
Fore.YELLOW
+ "Made the following alignment with `skrub.Joiner`:"
+ Fore.RESET
)
logger.debug(Fore.YELLOW + json.dumps(_mapping, indent=4) + Fore.RESET)
mapping = mapping | _mapping
# order by length is still preserved regardless of using fuzzy join, so after initial matching and possible fuzzy join matching
# This is because the lengths of each list will decrease at the same rate, so whichever list was larger at the beginning,
# will be larger here at the end.
# len(_outer) <= len(inner)
sorted_values = [_outer, inner]

# Finally, order by new (remaining) length and check if we swapped places from original
sorted_values = sorted(to_compare, key=len)
if sorted_values != values:
swapped = True
left_values, right_values = sorted_values

kwargs["left_values"] = left_values
kwargs["right_values"] = right_values

Expand All @@ -261,7 +308,6 @@ def __call__(
kwargs[IngredientKwarg.QUESTION] = question
_mapping: Dict[str, str] = self._run(*args, **kwargs)
mapping = mapping | _mapping

# Using mapped left/right values, create intermediary mapping table
temp_join_tablename = get_temp_session_table(str(uuid.uuid4())[:4])
# Below, we check to see if 'swapped' is True
Expand Down Expand Up @@ -290,7 +336,6 @@ def run(self, *args, **kwargs) -> dict:
@attrs
class QAIngredient(Ingredient):
ingredient_type: str = IngredientType.QA.value
num_values_passed: int = 0
allowed_output_types: Tuple[Type] = (Union[str, int, float],)

def __call__(
Expand Down
19 changes: 17 additions & 2 deletions blendsql/models/remote/_openai.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import os
import importlib.util
from outlines.models import openai, azure_openai, LogitsGenerator
from outlines.models.openai import OpenAIConfig
import tiktoken

from .._model import RemoteModel

DEFAULT_CONFIG = OpenAIConfig(temperature=0.0)

_has_openai = importlib.util.find_spec("openai") is not None


def openai_setup() -> None:
"""Setup helper for AzureOpenAI and OpenAI models."""
Expand Down Expand Up @@ -83,6 +85,13 @@ def __init__(
caching: bool = True,
**kwargs
):
if not _has_openai:
raise ImportError(
"Please install openai>=1.0.0 with `pip install openai>=1.0.0`!"
) from None

import tiktoken

super().__init__(
model_name_or_path=model_name_or_path,
tokenizer=tiktoken.encoding_for_model(model_name_or_path),
Expand All @@ -99,7 +108,6 @@ def _load_model(self, config: OpenAIConfig, **kwargs) -> LogitsGenerator:
self.model_name_or_path,
config=config,
azure_endpoint=os.getenv("OPENAI_API_BASE"),
api_version=os.getenv("OPENAI_API_VERSION"),
api_key=os.getenv("OPENAI_API_KEY"),
**kwargs
)
Expand Down Expand Up @@ -145,6 +153,13 @@ def __init__(
caching: bool = True,
**kwargs
):
if not _has_openai:
raise ImportError(
"Please install openai>=1.0.0 with `pip install openai>=1.0.0`!"
) from None

import tiktoken

super().__init__(
model_name_or_path=model_name_or_path,
tokenizer=tiktoken.encoding_for_model(model_name_or_path),
Expand Down
83 changes: 83 additions & 0 deletions run_debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from blendsql import blend, LLMJoin, LLMMap, LLMQA
from blendsql.db import SQLite
from blendsql.models import TransformersLLM
from blendsql.utils import fetch_from_hub
from tqdm import tqdm

# TEST_QUERIES = [
# """
# SELECT DISTINCT venue FROM w
# WHERE city = 'sydney' AND {{
# LLMMap(
# 'More than 30 total points?',
# 'w::score'
# )
# }} = TRUE
# """,
# """
# SELECT * FROM w
# WHERE city = {{
# LLMQA(
# 'Which city is located 120 miles west of Sydney?',
# (SELECT * FROM documents WHERE documents MATCH 'sydney OR 120'),
# options='w::city'
# )
# }}
# """,
# """
# SELECT date, rival, score, documents.content AS "Team Description" FROM w
# JOIN {{
# LLMJoin(
# left_on='documents::title',
# right_on='w::rival'
# )
# }}
# """
# ]

TEST_QUERIES = [
"""
SELECT title, player FROM w JOIN {{
LLMJoin(
left_on='documents::title',
right_on='w::player'
)
}}
"""
]
if __name__ == "__main__":
"""
Without cached LLM response (10 runs):
before: 3.16
after: 1.91
With cached LLM response (100 runs):
before: 0.0175
after: 0.0166
"Qwen -1.5-0.5B"
With cached LLM response (30 runs):
with fuzzy join: 0.431
without fuzzy join: 0.073
Without cached LLM response (30 runs):
with fuzzy join: 0.286
without fuzzy join: 318.85
"""
db = SQLite(fetch_from_hub("1966_NBA_Expansion_Draft_0.db"))
#db = SQLite(fetch_from_hub("multi_table.db"))
TEST_TRANSFORMERS_LLM = "hf-internal-testing/tiny-random-PhiForCausalLM"
model = TransformersLLM(TEST_TRANSFORMERS_LLM, caching=False)

times = []
for i in range(30):
for q in TEST_QUERIES:

# Make our smoothie - the executed BlendSQL script
smoothie = blend(
query=q,
db=db,
blender=model,
verbose=False,
ingredients={LLMJoin, LLMMap, LLMQA},
)
times.append(smoothie.meta.process_time_seconds)
print(smoothie.df)
print(f"Average time across {len(times)} runs: {sum(times) / len(times)}")
10 changes: 8 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def find_version(*file_paths):
url="https://github.com/parkervg/blendsql",
author="Parker Glenn",
author_email="[email protected]",
description="Query language to blend SQL logic and LLM reasoning across multi-modal data.",
description="Query language for blending SQL logic and LLM reasoning across multi-modal data.",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",
license="Apache License 2.0",
Expand All @@ -46,6 +46,12 @@ def find_version(*file_paths):
"python-dotenv==1.0.1",
"sqlglot==18.13.0",
"sqlalchemy>=2.0.0",
# skrub doesn't currently support python<3.10: https://github.com/skrub-data/skrub/issues/815
"skrub==0.1.0 ; python_version>='3.10'",
# We fetch this branch which removes python 3.10 style type annotations instead, then
"skrub @ git+https://github.com/jeromedockes/skrub.git@strip-type-annotations ; python_version<'3.10'",
# https://github.com/skrub-data/skrub/issues/910
"scikit-learn==1.4.2",
"huggingface_hub",
"datasets",
"lark",
Expand Down Expand Up @@ -74,7 +80,7 @@ def find_version(*file_paths):
"recognizers-text-suite",
"emoji==1.7.0",
],
"test": ["pytest", "huggingface_hub", "pre-commit"],
"test": ["pytest", "pre-commit", "llama-cpp-python", "transformers", "torch"],
"docs": [
"mkdocs-material",
"mkdocstrings",
Expand Down

0 comments on commit edfc122

Please sign in to comment.