From 8d2e67f5d0a7b2a4b811bcf13ec271f2f8bf7416 Mon Sep 17 00:00:00 2001
From: Jolan Thomassin <98430140+JolanThomassin@users.noreply.github.com>
Date: Tue, 12 Dec 2023 19:21:31 +0000
Subject: [PATCH] Fixes #9, code clarification

---
 ailab/db/finesse/test_queries/__init__.py | 23 +------
 bin/search-function-test-utilizing-llm.py | 81 +++++++++++++++++------
 2 files changed, 63 insertions(+), 41 deletions(-)

diff --git a/ailab/db/finesse/test_queries/__init__.py b/ailab/db/finesse/test_queries/__init__.py
index b9fdd43..271ac5f 100644
--- a/ailab/db/finesse/test_queries/__init__.py
+++ b/ailab/db/finesse/test_queries/__init__.py
@@ -13,7 +13,7 @@ def get_random_chunk(cursor):
         INNER JOIN
             "louis_0.0.6".crawl cr ON hc.md5hash = cr.md5hash
         WHERE
-            dc.score > 0.7
+            dc.score > 0.01
         ORDER BY
             RANDOM()
         LIMIT
@@ -21,24 +21,3 @@ def get_random_chunk(cursor):
     """
     cursor.execute(query)
     return cursor.fetchall()
-
-
-def to_delete_fct(cursor):
-    query = """
-        SELECT
-            ch.id AS chunk_score_id,
-            hc.md5hash AS md5hash_content_to_chunk,
-            hc.content AS html_content
-        FROM
-            "louis_0.0.6".chunk_score ch
-        LEFT JOIN
-            "louis_0.0.6".html_content_to_chunk hctc ON ch.id = hctc.chunk_id
-        LEFT JOIN
-            "louis_0.0.6".html_content hc ON hctc.md5hash = hc.md5hash
-        WHERE
-            ch.score > 0.9
-        LIMIT
-            1;
-    """
-    cursor.execute(query)
-    return cursor.fetchall()
diff --git a/bin/search-function-test-utilizing-llm.py b/bin/search-function-test-utilizing-llm.py
index ac6743b..b6cadfa 100644
--- a/bin/search-function-test-utilizing-llm.py
+++ b/bin/search-function-test-utilizing-llm.py
@@ -1,3 +1,18 @@
+"""
+Script Purpose:
+This script generates questions based on provided prompts
+and stores the responses as JSON files.
+It interacts with the AI model to create questions
+and saves the relevant data for each question in a JSON file.
+
+Usage:
+./search-function-test-utilizing-llm.py PROMPT_PATH
+
+Parameters:
+- PROMPT_PATH: Directory containing the API prompt files
+(qna_system_prompt.txt, qna_user_prompt.txt, and JSON template)
+"""
+
 import os
 import sys
 import json
@@ -11,7 +26,7 @@
 
 # Constants
 TEST_VERSION = date.today()
-REQUIRED_QUESTIONS = 1
+REQUIRED_QUESTIONS = 50
 CHARACTER_LIMIT = 14383
 STORAGE_PATH = "/home/vscode/finesse-data-2/qna"
 
@@ -26,7 +41,7 @@ def load_prompts_and_template(prompt_path):
 
 
 def construct_user_prompt(user_prompt, random_chunk_str, json_template):
-    """Constructs the user prompt using the user prompt, random chunk and json template"""
+    """Constructs the user prompt using prompt, chunk and json template"""
     return (
         f"{user_prompt}\n\nHere is the JSON containing the search:\n{random_chunk_str}"
         f"\n\nAnd here is the JSON template:\n{json_template}"
@@ -42,26 +57,54 @@ def generate_question(system_prompt, user_prompt, json_template, project_db):
         if not random_chunk:
             print("No chunk found in the database.")
             sys.exit(1)  # exit the program if chunk is empty
-
-        constructed_user_prompt = construct_user_prompt(
-            user_prompt, str(random_chunk), json_template
-        )
-        total_length = len(system_prompt) + len(constructed_user_prompt)
-        average_tokens += total_length
-
-        if total_length < CHARACTER_LIMIT:
-            response = openai.get_chat_answer(
-                system_prompt, constructed_user_prompt, 2000
-            )
-            data = json.loads(response.choices[0].message.content)
-            if isinstance(data, dict):
-                for chunk in random_chunk:
-                    data["text_content"] = chunk["text_content"]
-                save_response_to_file(data)
+
+        chunk_title = ""
+        for chunk in random_chunk:
+            chunk_title = chunk["title"]
+
+        ### TO REMOVE ###
+        words_to_check = [
+            "This page is part",
+            "Cette page fait partie",
+            "Archivée",
+            "archivée",
+            "Archived",
+            "archived"
+        ]
+
+        found_words = []
+
+        for word in words_to_check:
+            if word.lower() in chunk_title.lower():
+                found_words.append(word)
+
+        if found_words:
+            print("The following words were found in the string:")
+            for found_word in found_words:
+                print("-", found_word)
+            print("Skipping...")
+        else:
+            ### TO REMOVE ###
+
+            constructed_user_prompt = construct_user_prompt(
+                user_prompt, str(random_chunk), json_template
+            )
+            total_length = len(system_prompt) + len(constructed_user_prompt)
+            average_tokens += total_length
+
+            if total_length < CHARACTER_LIMIT:
+                response = openai.get_chat_answer(
+                    system_prompt, constructed_user_prompt, 2000
+                )
+                data = json.loads(response.choices[0].message.content)
+                if isinstance(data, dict):
+                    for chunk in random_chunk:
+                        data["text_content"] = chunk["text_content"]
+                    save_response_to_file(data)
 
     return average_tokens / REQUIRED_QUESTIONS
-    
+
 
 def save_response_to_file(data):
     """Saves the provided data to a new file"""
     file_number = 1
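
Note on the first hunk: the relevance cutoff in get_random_chunk moves from 0.7 to
0.01 by editing the SQL literal in place. If the cutoff keeps being tuned, binding
it as a query parameter avoids rewriting the SQL each time. A minimal sketch, with
assumptions labeled: the full SELECT/FROM clause sits outside the hunk, so the
table and alias below ("louis_0.0.6".document_chunk as dc) are stand-ins, and the
%s placeholder assumes a psycopg2-style cursor like the one passed in here.

    def get_random_chunk(cursor, min_score=0.01):
        # min_score defaults to the new cutoff from this patch; table and
        # column names are stand-ins for the real "louis_0.0.6" schema.
        query = """
            SELECT dc.*
            FROM "louis_0.0.6".document_chunk dc
            WHERE dc.score > %s
            ORDER BY RANDOM()
            LIMIT 1;
        """
        cursor.execute(query, (min_score,))
        return cursor.fetchall()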
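
Note on the second hunk: the block fenced by "### TO REMOVE ###" lists both case
variants ("Archivée"/"archivée", "Archived"/"archived") even though the comparison
already lowercases both sides, so half the entries are redundant. If the filter
ends up staying, it could be factored into a small, testable helper. A minimal
sketch, assuming each chunk is a dict with a "title" key as generate_question()
already expects; the helper name is_archived_chunk is illustrative, not part of
this patch.

    # Markers that flag archived or boilerplate pages; case variants are
    # unnecessary because the check below is case-insensitive.
    ARCHIVED_MARKERS = [
        "This page is part",
        "Cette page fait partie",
        "Archivée",
        "Archived",
    ]

    def is_archived_chunk(title: str) -> bool:
        """Return True if the title contains any archived-page marker."""
        lowered = title.lower()
        return any(marker.lower() in lowered for marker in ARCHIVED_MARKERS)

    # Usage inside the generation loop:
    #     if is_archived_chunk(chunk_title):
    #         print("Skipping archived page...")
    #         continue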