ai-cfia · JolanThomassin · Mar 7, 2024 · Nov 2, 2023 · Nov 8, 2023 · Nov 8, 2023
diff --git a/ailab/db/finesse/test_queries/__init__.py b/ailab/db/finesse/test_queries/__init__.py
@@ -1,12 +1,19 @@
 def get_random_chunk(cursor):
     query = """
-        SELECT dc.score AS score, cr.id AS crawl_id, ch.id AS chunk_id, ch.title, cr.url AS crawl_url, ch.text_content, ch.text_content
+        SELECT dc.score AS score, cr.id AS crawl_id, ch.id AS chunk_id, ch.title, cr.url AS crawl_url, ch.text_content
         FROM Chunk ch
         INNER JOIN html_content_to_chunk hctc ON ch.id = hctc.chunk_id
         INNER JOIN html_content hc ON hctc.md5hash = hc.md5hash
         INNER JOIN crawl cr ON hc.md5hash = cr.md5hash
         INNER JOIN documents dc ON ch.id = dc.chunk_id
-        WHERE dc.score > 0.0
+        WHERE dc.score > 0.0 
+        AND EXISTS (
+            SELECT 1 
+            FROM score sc
+            WHERE sc.entity_id = ch.id
+            AND sc.score_type = 'current' 
+            AND sc.score > 0.0
+        )
         ORDER BY RANDOM()
         LIMIT 1;
     """
@@ -15,7 +22,37 @@ def get_random_chunk(cursor):
 
 def chunk_test_quality(cursor):
+    """Fetch one high-scoring chunk_score row with its linked HTML content.
+
+    Runs a fixed query against ``louis_006.chunk_score`` keeping rows whose
+    ``score`` is above 0.9, LEFT JOINed through ``html_content_to_chunk`` to
+    ``html_content``. Because the joins are LEFT JOINs, the md5hash and
+    content columns come back NULL when no link exists. The query has
+    ``LIMIT 1`` and no ``ORDER BY``, so which qualifying row comes back is up
+    to the database.
+
+    Args:
+        cursor: Database cursor exposing ``execute`` and ``fetchall``
+            (presumably a psycopg cursor -- confirm against the caller).
+
+    Returns:
+        The result of ``cursor.fetchall()``: at most one row holding
+        ``chunk_score_id``, ``md5hash_content_to_chunk`` and ``html_content``.
+    """
     query = """
-
+        SELECT
+            ch.id AS chunk_score_id,
+            hc.md5hash AS md5hash_content_to_chunk,
+            hc.content AS html_content
+        FROM
+            louis_006.chunk_score ch
+        LEFT JOIN
+            louis_006.html_content_to_chunk hctc ON ch.id = hctc.chunk_id
+        LEFT JOIN
+            louis_006.html_content hc ON hctc.md5hash = hc.md5hash
+        WHERE
+            ch.score > 0.9
+        LIMIT
+            1;
     """
     cursor.execute(query)
     return cursor.fetchall()
diff --git a/bin/search-function-test-utilizing-llm.py b/bin/search-function-test-utilizing-llm.py
@@ -1,15 +1,16 @@
 import os
 import sys
 import json
+from datetime import date
 
 import ailab.db as db
 import ailab.db.finesse as finesse
 from ailab.models import openai
 
-from ailab.db.finesse.test_queries import get_random_chunk
+from ailab.db.finesse.test_queries import chunk_test_quality
 
-TEST_VERSION = "v001"
-WANTED_GENERATED_QUESTIONS = 10
+TEST_VERSION = date.today().isoformat()  # str "YYYY-MM-DD": stays concatenable/JSON-serialisable like the old "v001"
+WANTED_GENERATED_QUESTIONS = 5
 CHARACTER_LIMIT = 14383
 
 
@@ -34,7 +35,15 @@ def main():
     print("System Prompt:", system_prompt + "\n")
     print("User Prompt:", user_prompt + "\n")
 
-    average_tokens_by_chunk = 0
+    AVERAGE_TOKENS_BY_CHUNK = 0
+
+    ### WIP - TESTING NEW QUERY - WIP ###
+    with project_db.cursor() as cursor:
+        random_chunk = chunk_test_quality(cursor)
+        print(random_chunk)
+    ### WIP - TESTING NEW QUERY - WIP ###
+
+    """
     for i in range(WANTED_GENERATED_QUESTIONS):
         random_chunk = ""
 
@@ -57,9 +66,7 @@ def main():
             )
 
             total_length = len(system_prompt) + len(constructed_user_prompt)
-            print("Token limit : " + str(CHARACTER_LIMIT))
-            print("Prompt character : " + str(total_length) + "\n")
-            average_tokens_by_chunk += total_length
+            AVERAGE_TOKENS_BY_CHUNK += total_length
             if total_length < CHARACTER_LIMIT:
                 response = openai.get_chat_answer(
                     system_prompt, constructed_user_prompt, 2000
@@ -86,8 +93,9 @@ def main():
                         print("File saved into: " + file_path)
                         json.dump(data, json_file, ensure_ascii=False, indent=4)
 
-    average_tokens_by_chunk = average_tokens_by_chunk / WANTED_GENERATED_QUESTIONS
-    print("Average Tokens send to the API : " + str(average_tokens_by_chunk))
+    AVERAGE_TOKENS_BY_CHUNK = AVERAGE_TOKENS_BY_CHUNK / WANTED_GENERATED_QUESTIONS
+    print("Average Tokens send to the API : " + str(AVERAGE_TOKENS_BY_CHUNK))
+    """
 
 
 if __name__ == "__main__":

diff --git a/sql/2023-11-28-chunk-didactic-score.sql b/sql/2023-11-28-chunk-didactic-score.sql
@@ -0,0 +1,47 @@
+-- Rebuild the "didactic" score of every chunk in the louis_006 schema.
+--
+-- The score is the min-max normalised length of the HTML content linked to a
+-- chunk, rounded to one decimal: (len - min_len) / (max_len - min_len).
+
+-- Set the search path to the louis_006 schema
+SET search_path TO louis_006;
+
+CREATE TABLE IF NOT EXISTS chunk_score (
+    id UUID,
+    score FLOAT,
+    score_type VARCHAR(50)
+);
+
+-- Empties the whole table (every score_type) so a re-run does not pile up
+-- duplicate rows.
+TRUNCATE TABLE chunk_score;
+
+INSERT INTO chunk_score (id, score, score_type)
+SELECT
+    ch.id, -- Use the id column from the chunk table
+    ROUND(
+        -- "* 1.0" forces non-integer division. NULLIF turns a zero range (all
+        -- contents the same length) into NULL, so the score is NULL instead of
+        -- the statement failing with a division-by-zero error.
+        (
+            LENGTH(hc.content) - length_values.min_val
+        ) * 1.0 / NULLIF(length_values.max_val - length_values.min_val, 0),
+        1
+    ) AS tr_proportion,
+    'didactic' AS score_type
+FROM
+    louis_006.chunk ch
+INNER JOIN louis_006.html_content_to_chunk hctc ON ch.id = hctc.chunk_id
+INNER JOIN louis_006.html_content hc ON hctc.md5hash = hc.md5hash
+CROSS JOIN (
+    -- Shortest and longest linked content length, as a single row.
+    SELECT
+        MIN(LENGTH(hc.content)) AS min_val,
+        MAX(LENGTH(hc.content)) AS max_val
+    FROM
+        louis_006.chunk ch
+    INNER JOIN louis_006.html_content_to_chunk hctc ON ch.id = hctc.chunk_id
+    INNER JOIN louis_006.html_content hc ON hctc.md5hash = hc.md5hash
+) AS length_values
+ORDER BY
+    tr_proportion DESC;
diff --git a/sql/2023-11-28-create-histogram.sql b/sql/2023-11-28-create-histogram.sql
@@ -0,0 +1,8 @@
+-- Histogram of chunk scores: one row per distinct score value together with
+-- the number of chunk_score rows carrying it, in ascending score order.
+SELECT
+    cs.score,
+    COUNT(*) AS count
+FROM louis_006.chunk_score AS cs
+GROUP BY cs.score
+ORDER BY cs.score;
diff --git a/sql/2023-11-28-print-schema-table.sql b/sql/2023-11-28-print-schema-table.sql
@@ -0,0 +1,7 @@
+-- List every column of every table in the louis_006 schema with its data type.
+-- Rows are sorted by table and then by the column's position in that table so
+-- the listing is stable between runs.
+SELECT table_name, column_name, data_type
+FROM information_schema.columns
+WHERE table_schema = 'louis_006'
+ORDER BY table_name, ordinal_position;