-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Enhanced search ability, updated training data.
Made google_search.py much more powerful by making it able to read text from websites. Also, increased the result text limit by a lot. Updated the large action models training data, and added a dataset for the search query generator.
- Loading branch information
1 parent
1c465c4
commit 99bf388
Showing
5 changed files
with
81 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
"Your job is to create a search query based on what the user said.", | ||
"who is the president?", | ||
"Who is the current president?", | ||
"what is the capital of france?", | ||
"What is the capital of France?", | ||
"who made the first iphone?", | ||
"Who made the first iPhone?", | ||
"who is the biggest company in tech", | ||
"Biggest tech company" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,63 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
def google_search(query, num_results=15):
    """Scrape Google's result page for *query* and return structured results.

    Args:
        query: Free-text search string; URL-encoded automatically.
        num_results: Number of results to request from Google (default 15).

    Returns:
        A list of dicts, each with 'title', 'link', and 'description' keys.
        Missing fields are filled with 'No ... available' sentinel strings.

    Raises:
        Exception: If the HTTP response status is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Pass the query through `params` so requests URL-encodes it; the previous
    # f-string interpolation broke on spaces, '&', '#', and non-ASCII queries.
    response = requests.get(
        'https://www.google.com/search',
        params={'q': query, 'num': num_results},
        headers=headers,
    )

    if response.status_code != 200:
        raise Exception(f"Failed to fetch search results: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')
    search_results = []

    # NOTE(review): 'tF2Cxc' (result container) and 'aCOpRe' (snippet) are
    # Google's generated CSS classes — they change without notice, so this
    # parser may silently return an empty list when the markup rotates.
    for result_div in soup.find_all('div', class_='tF2Cxc'):
        link_tag = result_div.find('a')
        title_tag = result_div.find('h3')
        description_tag = result_div.find('span', class_='aCOpRe')

        link = link_tag['href'] if link_tag else 'No link available'
        title = title_tag.text if title_tag else 'No title available'
        description = description_tag.text if description_tag else 'No description available'

        search_results.append({
            'title': title,
            'link': link,
            'description': description
        })

    return search_results
|
||
def fetch_page_text(url, timeout=10):
    """Download *url* and return the visible text of its paragraph elements.

    Args:
        url: Page URL to fetch.
        timeout: Seconds to wait for the HTTP response. New parameter with a
            default, so existing callers are unaffected.

    Returns:
        The newline-joined text of all <p> tags, or a human-readable error
        string when the fetch or parse fails (deliberate best-effort: one bad
        page must not abort a whole search run).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Without a timeout a stalled server would hang the caller forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text from paragraphs; other elements are intentionally
        # skipped to keep navigation/boilerplate out of the result.
        paragraphs = soup.find_all('p')
        text_content = "\n".join(p.get_text() for p in paragraphs)

        return text_content
    except Exception as e:
        return f"Failed to fetch text from {url}: {e}"
|
||
def main(query, num_results=10):
    """Search Google for *query* and attach each result page's text.

    Args:
        query: Free-text search string.
        num_results: Number of search results to request (default 10).

    Returns:
        A list of dicts with 'title', 'link', 'description', and 'page_text'
        keys; 'page_text' is capped at 20,000 characters.
    """
    results = google_search(query, num_results)
    full_results = []

    for result in results:
        link = result['link']
        # google_search emits this sentinel when a result had no href;
        # it is not a URL, so skip the fetch instead of requesting it.
        if link == 'No link available':
            page_text = ''
        else:
            page_text = fetch_page_text(link)
        full_results.append({
            'title': result['title'],
            'link': link,
            'description': result['description'],
            'page_text': page_text[:20000]  # Cap stored text at 20,000 chars
        })

    return full_results