diff --git a/data_files/lam_training_data.txt b/data_files/lam_training_data.txt
index 02143bc..a28c1a5 100644
--- a/data_files/lam_training_data.txt
+++ b/data_files/lam_training_data.txt
@@ -21,5 +21,11 @@
 "yes",
 "who is the ceo of razer?",
 "yes",
-"what should we talk about?"
-"no",
\ No newline at end of file
+"what should we talk about?",
+"no",
+"what are some things i can do when im bored",
+"yes",
+"newest ai model",
+"yes",
+"software versioning",
+"yes",
\ No newline at end of file
diff --git a/data_files/search_training_data.txt b/data_files/search_training_data.txt
new file mode 100644
index 0000000..4829df5
--- /dev/null
+++ b/data_files/search_training_data.txt
@@ -0,0 +1,9 @@
+"Your job is to create a search query based on what the user said.",
+"who is the president?",
+"Who is the current president?",
+"what is the capital of france?",
+"What is the capital of France?",
+"who made the first iphone?",
+"Who made the first iPhone?",
+"who is the biggest company in tech",
+"Biggest tech company"
\ No newline at end of file
diff --git a/main.py b/main.py
index 735f5da..75d4987 100644
--- a/main.py
+++ b/main.py
@@ -11,7 +11,7 @@
 if query == "yes":
     print("Searching...")
     search = sqg.generate_response(prompt)
-    results = sg.search_google(search)
+    results = sg.main(search)
     complete_result = sm.generate_response(results)
     print(complete_result)
 elif query == "no":
diff --git a/models/search_query_generator.py b/models/search_query_generator.py
index 248c89b..02ef1aa 100644
--- a/models/search_query_generator.py
+++ b/models/search_query_generator.py
@@ -3,7 +3,11 @@
 
 genai.configure(api_key=os.environ["GEMINI_API_KEY"])
 
-
+# Load the few-shot examples from the data file, resolved relative to this module.
+dir_path = os.path.dirname(os.path.realpath(__file__))
+file_path = os.path.join(dir_path, '..', 'data_files', 'search_training_data.txt')
+with open(file_path, 'r') as file:
+    training_data = file.read()
 
 memory = []
 
@@ -23,17 +27,10 @@
 
 def generate_response(prompt):
     response = model.generate_content([
-        "Your job is to create a search query based on what the user said.",
-        "input: who is the president?",
-        "output: Who is the current president?",
-        "input: what is the capital of france?",
-        "output: What is the capital of France?"
- "input: who made the first iphone?", - "output: Who made the first iPhone?", + f"{training_data}", f"input: {prompt}", "output: ", ]) memory.append(f"input: {prompt}") memory.append(f"output: {response.text}") - return response.text diff --git a/search/google_search.py b/search/google_search.py index d4aae09..3eb1f63 100644 --- a/search/google_search.py +++ b/search/google_search.py @@ -1,21 +1,63 @@ import requests from bs4 import BeautifulSoup -def search_google(query): - search_url = "https://www.google.com/search" - params = {'q': query} +def google_search(query, num_results=15): headers = { - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" - ) + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } - - response = requests.get(search_url, params=params, headers=headers) - response.raise_for_status() - soup = BeautifulSoup(response.content, 'html.parser') - result_texts = [] - for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']): - result_texts.append(tag.get_text()) - - return '\n'.join(result_texts) + response = requests.get(f'https://www.google.com/search?q={query}&num={num_results}', headers=headers) + + if response.status_code != 200: + raise Exception(f"Failed to fetch search results: {response.status_code}") + + soup = BeautifulSoup(response.text, 'html.parser') + search_results = [] + + for g in soup.find_all('div', class_='tF2Cxc'): + link_tag = g.find('a') + title_tag = g.find('h3') + description_tag = g.find('span', class_='aCOpRe') + + link = link_tag['href'] if link_tag else 'No link available' + title = title_tag.text if title_tag else 'No title available' + description = description_tag.text if description_tag else 'No description available' + + search_results.append({ + 'title': title, + 'link': link, + 'description': description + }) + + return search_results + +def fetch_page_text(url): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + try: + response = requests.get(url, headers=headers) + response.raise_for_status() # Raise an HTTPError for bad responses + soup = BeautifulSoup(response.text, 'html.parser') + + # Extract text from paragraphs and other relevant text elements + paragraphs = soup.find_all('p') + text_content = "\n".join([p.get_text() for p in paragraphs]) + + return text_content + except Exception as e: + return f"Failed to fetch text from {url}: {e}" + +def main(query, num_results=10): + results = google_search(query, num_results) + full_results = [] + + for result in results: + page_text = fetch_page_text(result['link']) + full_results.append({ + 'title': result['title'], + 'link': result['link'], + 'description': result['description'], + 'page_text': page_text[:20000] # Limited to max text on page + }) + + return full_results \ No newline at end of file