
Commit

Enhanced search ability, updated training data.
Made google_search.py much more powerful: it can now read the text of each result page directly, and the result text limit was raised substantially (page text is capped at 20,000 characters). Updated the large action model's training data, and added a dataset for the search query generator.
HunterH1218 committed Jul 4, 2024
1 parent 1c465c4 commit 99bf388
Showing 5 changed files with 81 additions and 28 deletions.
10 changes: 8 additions & 2 deletions data_files/lam_training_data.txt
@@ -21,5 +21,11 @@
 "yes",
 "who is the ceo of razer?",
 "yes",
-"what should we talk about?"
-"no",
+"what should we talk about?",
+"no",
+"what are some things i can do when im bored",
+"yes",
+"newest ai model",
+"yes",
+"software versioning",
+"yes",
9 changes: 9 additions & 0 deletions data_files/search_training_data.txt
@@ -0,0 +1,9 @@
+"Your job is to create a search query based on what the user said.",
+"who is the president?",
+"Who is the current president?",
+"what is the capital of france?",
+"What is the capital of France?"
+"who made the first iphone?",
+"Who made the first iPhone?",
+"who is the biggest company in tech",
+"Biggest tech company"
2 changes: 1 addition & 1 deletion main.py
@@ -11,7 +11,7 @@
 if query == "yes":
     print("Seaching...")
     search = sqg.generate_response(prompt)
-    results = sg.search_google(search)
+    results = sg.main(search)
     complete_result = sm.generate_response(results)
     print(complete_result)
 elif query == "no":
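
The switch from sg.search_google(search) to sg.main(search) changes what the summarizer receives: a list of result dicts rather than one flat string. A sketch of the data flowing between the two calls (the dict shape is taken from the google_search.py diff below; the example query is from the new training data):

    search = "Biggest tech company"      # produced by sqg.generate_response(prompt)
    results = sg.main(search)            # list of dicts, one per search hit
    # results[0] == {'title': ..., 'link': ..., 'description': ..., 'page_text': ...}
    complete_result = sm.generate_response(results)   # sm alias from main.py's imports (not shown)
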
14 changes: 5 additions & 9 deletions models/search_query_generator.py
@@ -3,7 +3,10 @@
 
 genai.configure(api_key=os.environ["GEMINI_API_KEY"])
 
-
+dir_path = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(dir_path, '..', 'data_files', 'search_training_data.txt')
+with open(file_path, 'r') as file:
+    training_data = file.read()
 
 memory = []
 
@@ -23,17 +26,10 @@
 
 def generate_response(prompt):
     response = model.generate_content([
-        "Your job is to create a search query based on what the user said.",
-        "input: who is the president?",
-        "output: Who is the current president?",
-        "input: what is the capital of france?",
-        "output: What is the capital of France?"
-        "input: who made the first iphone?",
-        "output: Who made the first iPhone?",
+        f"{training_data}",
         f"input: {prompt}",
         "output: ",
     ])
     memory.append(f"input: {prompt}")
     memory.append(f"output: {response.text}")
-
     return response.text
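
A quick usage sketch (the import path follows the repository layout and is an assumption; the expected output is modeled on the pairs in search_training_data.txt):

    from models import search_query_generator as sqg   # import path assumed

    query = sqg.generate_response("who is the biggest company in tech")
    print(query)   # expected along the lines of "Biggest tech company"
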
74 changes: 58 additions & 16 deletions search/google_search.py
@@ -1,21 +1,63 @@
 import requests
 from bs4 import BeautifulSoup
 
-def search_google(query):
-    search_url = "https://www.google.com/search"
-    params = {'q': query}
+def google_search(query, num_results=15):
     headers = {
-        "User-Agent": (
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
-            "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
-        )
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     }
 
-    response = requests.get(search_url, params=params, headers=headers)
-    response.raise_for_status()
-    soup = BeautifulSoup(response.content, 'html.parser')
-    result_texts = []
-    for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
-        result_texts.append(tag.get_text())
-
-    return '\n'.join(result_texts)
+    response = requests.get(f'https://www.google.com/search?q={query}&num={num_results}', headers=headers)
+
+    if response.status_code != 200:
+        raise Exception(f"Failed to fetch search results: {response.status_code}")
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+    search_results = []
+
+    for g in soup.find_all('div', class_='tF2Cxc'):
+        link_tag = g.find('a')
+        title_tag = g.find('h3')
+        description_tag = g.find('span', class_='aCOpRe')
+
+        link = link_tag['href'] if link_tag else 'No link available'
+        title = title_tag.text if title_tag else 'No title available'
+        description = description_tag.text if description_tag else 'No description available'
+
+        search_results.append({
+            'title': title,
+            'link': link,
+            'description': description
+        })
+
+    return search_results
+
+def fetch_page_text(url):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+    try:
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise an HTTPError for bad responses
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Extract text from paragraphs and other relevant text elements
+        paragraphs = soup.find_all('p')
+        text_content = "\n".join([p.get_text() for p in paragraphs])
+
+        return text_content
+    except Exception as e:
+        return f"Failed to fetch text from {url}: {e}"
+
+def main(query, num_results=10):
+    results = google_search(query, num_results)
+    full_results = []
+
+    for result in results:
+        page_text = fetch_page_text(result['link'])
+        full_results.append({
+            'title': result['title'],
+            'link': result['link'],
+            'description': result['description'],
+            'page_text': page_text[:20000]  # Limited to max text on page
+        })
+
+    return full_results
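
A usage sketch for the new module, using only the functions defined above. One design note: tF2Cxc and aCOpRe are Google's own result-page class names, so the scraper depends on markup Google can change at any time:

    # Fetch structured results plus page text for a query.
    results = main("newest ai model", num_results=5)
    for r in results:
        print(r['title'])
        print(r['link'])
        print(r['page_text'][:200], '...')   # per-page text is capped at 20,000 chars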
