From e2e0762b8e91bf0a327eac41d449fa265ca5ccb8 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Sun, 7 Jul 2024 12:14:05 +0300 Subject: [PATCH 1/6] v0.1.0a11 --- tsellm/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsellm/__version__.py b/tsellm/__version__.py index 22a9227..9558e43 100644 --- a/tsellm/__version__.py +++ b/tsellm/__version__.py @@ -1,3 +1,3 @@ __title__ = "tsellm" __description__ = "Use LLMs in SQLite and DuckDB" -__version__ = "0.1.0a10" +__version__ = "0.1.0a11" From 31f77b8134facaa4a83b8710153b3f727ffba0f5 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Mon, 8 Jul 2024 09:29:15 +0300 Subject: [PATCH 2/6] 0.1.0a12 --- tsellm/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsellm/__version__.py b/tsellm/__version__.py index 9558e43..69c8cf7 100644 --- a/tsellm/__version__.py +++ b/tsellm/__version__.py @@ -1,3 +1,3 @@ __title__ = "tsellm" __description__ = "Use LLMs in SQLite and DuckDB" -__version__ = "0.1.0a11" +__version__ = "0.1.0a12" From b45100cab1186ff75d1c7c960a5d10e95b0ab2b0 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Thu, 15 Aug 2024 13:20:24 +0300 Subject: [PATCH 3/6] json_embed implementation for SQLite --- tests/test_tsellm.py | 35 ++++++++++++++++++++++++++++++++++- tsellm/cli.py | 4 +++- tsellm/core.py | 21 +++++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/tests/test_tsellm.py b/tests/test_tsellm.py index 3f0e39b..e2ded0c 100644 --- a/tests/test_tsellm.py +++ b/tests/test_tsellm.py @@ -3,7 +3,6 @@ import unittest from pathlib import Path from test.support import captured_stdout, captured_stderr, captured_stdin -from test.support.os_helper import TESTFN, unlink import duckdb import llm.cli @@ -225,6 +224,40 @@ def test_embed_hazo_binary(self): self.assertTrue(llm.get_embedding_model("hazo").supports_binary) self.expect_success(*self.path_args, "select embed(randomblob(16), 'hazo')") + def 
test_embed_json_recursive(self): + example_json = """{ + \"name\": \"Alice\", + \"details\": { + \"age\": 30, + \"hobbies\": [\"reading\", \"cycling\"], + \"location\": \"Wonderland\" + }, + \"greeting\": \"Hello, World!\" + }""" + out = self.expect_success( + *self.path_args, + f"select json_extract('{example_json}', '$.name')", + ) + self.assertEqual( + "('Alice',)\n", + out, + ) + + out = self.expect_success( + *self.path_args, + f"select json_embed('{example_json}', 'hazo')", + ) + self.assertEqual( + ('(\'{"name": [5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' + '0.0, 0.0, 0.0, 0.0], "details": {"age": 30, "hobbies": [[7.0, 0.0, 0.0, 0.0, ' + '0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [7.0, 0.0, 0.0, ' + '0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], ' + '"location": [10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' + '0.0, 0.0, 0.0, 0.0]}, "greeting": [6.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' + "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}',)\n"), + out, + ) + def test_embed_default_hazo(self): self.assertEqual(llm_cli.get_default_embedding_model(), "hazo") out = self.expect_success(*self.path_args, "select embed('hello world')") diff --git a/tsellm/cli.py b/tsellm/cli.py index 80707d0..30e1f69 100644 --- a/tsellm/cli.py +++ b/tsellm/cli.py @@ -16,6 +16,7 @@ _prompt_model, _prompt_model_default, _embed_model, + _json_embed_model, _embed_model_default, ) @@ -79,6 +80,7 @@ class TsellmConsole(InteractiveConsole, ABC): ("prompt", 1, _prompt_model_default, False), ("embed", 2, _embed_model, False), ("embed", 1, _embed_model_default, False), + ("json_embed", 2, _json_embed_model, False), ] error_class = None @@ -87,7 +89,7 @@ class TsellmConsole(InteractiveConsole, ABC): @staticmethod def create_console( - fp: Union[str, Path], in_memory_type: DatabaseType = DatabaseType.UNKNOWN + fp: Union[str, Path], in_memory_type: DatabaseType = DatabaseType.UNKNOWN ): sniffer = DBSniffer(fp) if 
sniffer.is_in_memory: diff --git a/tsellm/core.py b/tsellm/core.py index 5f87e1f..054aff2 100644 --- a/tsellm/core.py +++ b/tsellm/core.py @@ -14,6 +14,21 @@ """ + +def json_recurse_apply(json_obj, f): + if isinstance(json_obj, dict): + # Recursively apply the function to dictionary values + return {k: json_recurse_apply(v, f) for k, v in json_obj.items()} + elif isinstance(json_obj, list): + # Recursively apply the function to list elements + return [json_recurse_apply(item, f) for item in json_obj] + elif isinstance(json_obj, str): + # Apply the function to string values, which returns a list of floats + return f(json_obj) + else: + # Return the object as is if it's neither a dictionary, list, or string + return json_obj + def _prompt_model(prompt: str, model: str) -> str: return llm.get_model(model).prompt(prompt).text() @@ -26,6 +41,12 @@ def _embed_model(text: str, model: str) -> str: return json.dumps(llm.get_embedding_model(model).embed(text)) +def _json_embed_model(js: str, model: str) -> str: + return json.dumps( + json_recurse_apply(json.loads(js), lambda v: json.loads(_embed_model(v, model))) + ) + + def _embed_model_default(text: str) -> str: return json.dumps( llm.get_embedding_model(llm_cli.get_default_embedding_model()).embed(text) From c5bb51aeab94c680b6ed57cf40a7c1502aebc0e2 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Thu, 15 Aug 2024 19:01:57 +0300 Subject: [PATCH 4/6] json_embed(json,model) for DuckDB --- tests/test_tsellm.py | 65 ++++++++++++++++++++++++++++++++------------ tsellm/cli.py | 1 + tsellm/core.py | 2 +- 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/tests/test_tsellm.py b/tests/test_tsellm.py index e2ded0c..df3018c 100644 --- a/tests/test_tsellm.py +++ b/tests/test_tsellm.py @@ -174,6 +174,15 @@ def test_interact_valid_multiline_sql(self): class InMemorySQLiteTest(TsellmConsoleTest): path_args = None + alice_json = """{ + \"name\": \"Alice\", + \"details\": { + \"age\": 30, + \"hobbies\": [\"reading\", 
\"cycling\"], + \"location\": \"Wonderland\" + }, + \"greeting\": \"Hello, World!\" + }""" def setUp(self): super().setUp() @@ -225,18 +234,9 @@ def test_embed_hazo_binary(self): self.expect_success(*self.path_args, "select embed(randomblob(16), 'hazo')") def test_embed_json_recursive(self): - example_json = """{ - \"name\": \"Alice\", - \"details\": { - \"age\": 30, - \"hobbies\": [\"reading\", \"cycling\"], - \"location\": \"Wonderland\" - }, - \"greeting\": \"Hello, World!\" - }""" out = self.expect_success( *self.path_args, - f"select json_extract('{example_json}', '$.name')", + f"select json_extract('{self.alice_json}', '$.name')", ) self.assertEqual( "('Alice',)\n", @@ -245,16 +245,18 @@ def test_embed_json_recursive(self): out = self.expect_success( *self.path_args, - f"select json_embed('{example_json}', 'hazo')", + f"select json_embed('{self.alice_json}', 'hazo')", ) self.assertEqual( - ('(\'{"name": [5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' - '0.0, 0.0, 0.0, 0.0], "details": {"age": 30, "hobbies": [[7.0, 0.0, 0.0, 0.0, ' - '0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [7.0, 0.0, 0.0, ' - '0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], ' - '"location": [10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' - '0.0, 0.0, 0.0, 0.0]}, "greeting": [6.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' - "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}',)\n"), + ( + '(\'{"name": [5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' + '0.0, 0.0, 0.0, 0.0], "details": {"age": 30, "hobbies": [[7.0, 0.0, 0.0, 0.0, ' + "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [7.0, 0.0, 0.0, " + "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], " + '"location": [10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' + '0.0, 0.0, 0.0, 0.0]}, "greeting": [6.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' + "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}',)\n" + ), out, ) @@ -323,6 +325,33 @@ def 
test_embed_hazo_binary(self): # See https://github.com/Florents-Tselai/tsellm/issues/25 pass + def test_embed_json_recursive(self): + out = self.expect_success( + *self.path_args, + f"select '{self.alice_json}'::json -> 'name'", + ) + self.assertEqual( + "('\"Alice\"',)\n", + out, + ) + + out = self.expect_success( + *self.path_args, + f"select json_embed('{self.alice_json}'::json, 'hazo')", + ) + self.assertEqual( + ( + '(\'{"name": [5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' + '0.0, 0.0, 0.0, 0.0], "details": {"age": 30, "hobbies": [[7.0, 0.0, 0.0, 0.0, ' + "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [7.0, 0.0, 0.0, " + "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], " + '"location": [10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' + '0.0, 0.0, 0.0, 0.0]}, "greeting": [6.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ' + "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}',)\n" + ), + out, + ) + class DiskDuckDBTest(InMemoryDuckDBTest): db_fp = None diff --git a/tsellm/cli.py b/tsellm/cli.py index 30e1f69..3337739 100644 --- a/tsellm/cli.py +++ b/tsellm/cli.py @@ -276,6 +276,7 @@ def is_valid_db(self) -> bool: _functions = [ ("prompt", 2, _prompt_model, False), ("embed", 2, _embed_model, False), + ("json_embed", 2, _json_embed_model, False), ] def connect(self): diff --git a/tsellm/core.py b/tsellm/core.py index 054aff2..6797180 100644 --- a/tsellm/core.py +++ b/tsellm/core.py @@ -14,7 +14,6 @@ """ - def json_recurse_apply(json_obj, f): if isinstance(json_obj, dict): # Recursively apply the function to dictionary values @@ -29,6 +28,7 @@ def json_recurse_apply(json_obj, f): # Return the object as is if it's neither a dictionary, list, or string return json_obj + def _prompt_model(prompt: str, model: str) -> str: return llm.get_model(model).prompt(prompt).text() From 95e104ff916e0a5506468c76dd68de3400f8b280 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Thu, 15 Aug 2024 19:30:19 +0300 Subject: [PATCH 5/6] 
Update README.md --- README.md | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 04f8ae3..1576b7a 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,6 @@ [![codecov](https://codecov.io/gh/Florents-Tselai/tsellm/branch/main/graph/badge.svg)](https://codecov.io/gh/Florents-Tselai/tsellm) [![License](https://img.shields.io/badge/BSD%20license-blue.svg)](https://github.com/Florents-Tselai/tsellm/blob/main/LICENSE) - - - **tsellm** is the easiest way to access LLMs from SQLite or DuckDB. ```shell @@ -49,6 +46,45 @@ llm sentence-transformers register all-MiniLM-L12-v2 ```sql tsellm prompts.sqlite3 "select embed(p, 'sentence-transformers/all-MiniLM-L12-v2')" ``` +### Embedding `JSON` Recursively + +If you have `JSON` columns, you can embed these objects recursively. +That is, an embedding vector of floats will replace each text occurrence in the object. + +```bash +cat <<EOF | tee >(sqlite3 prompts.sqlite3) | duckdb prompts.duckdb +CREATE TABLE people(d JSON); +INSERT INTO people (d) VALUES +('{"name": "John Doe", "age": 30, "hobbies": ["reading", "biking"]}'), +('{"name": "Jane Smith", "age": 25, "hobbies": ["painting", "traveling"]}'); +EOF +``` + +#### SQLite + +```sql +tsellm prompts.sqlite3 "select json_embed(d, 'hazo') from people" +``` + +*Output* + +``` +('{"name": [4.0, 3.0,..., 0.0], "age": 30, "hobbies": [[7.0, 0.0,..., 0.0], [6.0, 0.0, ..., 0.0]]}',) +('{"name": [4.0, 5.0,..., 0.0], "age": 25, "hobbies": [[8.0, 0.0,..., 0.0], [9.0, 0.0,..., 0.0]]}',) +``` + +#### DuckDB + +```sql +tsellm prompts.duckdb "select json_embed(d, 'hazo') from people" +``` + +*Output* + +``` +('{"name": [4.0, 3.0,..., 0.0], "age": 30, "hobbies": [[7.0, 0.0,..., 0.0], [6.0, 0.0, ..., 0.0]]}',) +('{"name": [4.0, 5.0,..., 0.0], "age": 25, "hobbies": [[8.0, 0.0,..., 0.0], [9.0, 0.0,..., 0.0]]}',) +``` ### Embeddings for binary (`BLOB`) columns From de30285821147441a901deb62cee8644aa213dec Mon Sep 17 00:00:00
2001 From: Florents Tselai Date: Thu, 15 Aug 2024 19:37:36 +0300 Subject: [PATCH 6/6] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1576b7a..f22ca0c 100644 --- a/README.md +++ b/README.md @@ -41,11 +41,13 @@ so you can use any of its plugins: ```shell llm install llm-sentence-transformers llm sentence-transformers register all-MiniLM-L12-v2 +llm install llm-embed-hazo # dummy embedding model for demonstration purposes ``` ```sql tsellm prompts.sqlite3 "select embed(p, 'sentence-transformers/all-MiniLM-L12-v2')" ``` + ### Embedding `JSON` Recursively If you have `JSON` columns, you can embed these objects recursively.