-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Enhanced search ability, updated training data.
Made google_search.py much more powerful by making it able to read text from websites. Also, increased the result text limit by a lot. Updated the large action models training data, and added a dataset for the search query generator.
- Loading branch information
1 parent
1c465c4
commit 99bf388
Showing
5 changed files
with
81 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
"Your job is to create a search query based on what the user said.", | ||
"who is the president?", | ||
"Who is the current president?", | ||
"what is the capital of france?", | ||
"What is the capital of France?", | ||
"who made the first iphone?", | ||
"Who made the first iPhone?", | ||
"who is the biggest company in tech", | ||
"Biggest tech company" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,63 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
def google_search(query, num_results=15):
    """Scrape Google's result page for *query* and return structured results.

    Args:
        query: Free-text search string; URL-encoded automatically.
        num_results: Number of results to request from Google (default 15).

    Returns:
        A list of dicts, each with 'title', 'link', and 'description' keys.
        Missing fields are filled with 'No ... available' sentinel strings.

    Raises:
        Exception: If the HTTP response status is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Pass the query through `params` so requests URL-encodes it; the previous
    # f-string interpolation broke on spaces, '&', '#', and non-ASCII queries.
    response = requests.get(
        'https://www.google.com/search',
        params={'q': query, 'num': num_results},
        headers=headers,
    )

    if response.status_code != 200:
        raise Exception(f"Failed to fetch search results: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')
    search_results = []

    # NOTE(review): 'tF2Cxc' (result container) and 'aCOpRe' (snippet) are
    # Google's generated CSS classes — they change without notice, so this
    # parser may silently return an empty list when the markup rotates.
    for result_div in soup.find_all('div', class_='tF2Cxc'):
        link_tag = result_div.find('a')
        title_tag = result_div.find('h3')
        description_tag = result_div.find('span', class_='aCOpRe')

        link = link_tag['href'] if link_tag else 'No link available'
        title = title_tag.text if title_tag else 'No title available'
        description = description_tag.text if description_tag else 'No description available'

        search_results.append({
            'title': title,
            'link': link,
            'description': description
        })

    return search_results
|
||
def fetch_page_text(url, timeout=10):
    """Download *url* and return the visible text of its paragraph elements.

    Args:
        url: Page URL to fetch.
        timeout: Seconds to wait for the HTTP response. New parameter with a
            default, so existing callers are unaffected.

    Returns:
        The newline-joined text of all <p> tags, or a human-readable error
        string when the fetch or parse fails (deliberate best-effort: one bad
        page must not abort a whole search run).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Without a timeout a stalled server would hang the caller forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text from paragraphs; other elements are intentionally
        # skipped to keep navigation/boilerplate out of the result.
        paragraphs = soup.find_all('p')
        text_content = "\n".join(p.get_text() for p in paragraphs)

        return text_content
    except Exception as e:
        return f"Failed to fetch text from {url}: {e}"
|
||
def main(query, num_results=10):
    """Search Google for *query* and attach each result page's text.

    Args:
        query: Free-text search string.
        num_results: Number of search results to request (default 10).

    Returns:
        A list of dicts with 'title', 'link', 'description', and 'page_text'
        keys; 'page_text' is capped at 20,000 characters.
    """
    results = google_search(query, num_results)
    full_results = []

    for result in results:
        link = result['link']
        # google_search emits this sentinel when a result had no href;
        # it is not a URL, so skip the fetch instead of requesting it.
        if link == 'No link available':
            page_text = ''
        else:
            page_text = fetch_page_text(link)
        full_results.append({
            'title': result['title'],
            'link': link,
            'description': result['description'],
            'page_text': page_text[:20000]  # Cap stored text at 20,000 chars
        })

    return full_results