Added fallbacks for finding description and title #81

Open · wants to merge 2 commits into base: master
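As a reading aid, here is a minimal, self-contained sketch of the span/<em> fallback heuristic the diff below adds for descriptions; the HTML is made up for illustration and is not from the PR:

from bs4 import BeautifulSoup

html = """
<div class="g">
  <a href="https://example.com"><h3>Example Domain</h3></a>
  <span>Unrelated navigation text</span>
  <span>A snippet where the <em>query</em> terms are highlighted.</span>
</div>
"""
result = BeautifulSoup(html, "html.parser").find("div", attrs={"class": "g"})

# Preferred source: the usual description box (absent in this made-up HTML).
description_box = result.find("div", {"style": "-webkit-line-clamp:2"})
if not description_box:
    # Fallback: a <span> with a direct <em> child usually holds the snippet.
    spans_with_em = [s for s in result.find_all("span") if s.find("em", recursive=False)]
    if spans_with_em:
        print(spans_with_em[0].text)  # -> A snippet where the query terms are highlighted.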
62 changes: 43 additions & 19 deletions googlesearch/__init__.py
@@ -1,4 +1,5 @@
 """googlesearch is a Python library for searching Google, easily."""
+
 from time import sleep
 from bs4 import BeautifulSoup
 from requests import get
@@ -9,9 +10,7 @@
 def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify):
     resp = get(
         url="https://www.google.com/search",
-        headers={
-            "User-Agent": get_useragent()
-        },
+        headers={"User-Agent": get_useragent()},
         params={
             "q": term,
             "num": results + 2,  # Prevents multiple requests
@@ -37,10 +36,12 @@ def __repr__(self):
         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"


-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None):
+def search(
+    term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None
+):
     """Search the Google search engine"""

-    escaped_term = urllib.parse.quote_plus(term) # make 'site:xxx.xxx.xxx ' works.
+    escaped_term = urllib.parse.quote_plus(term)  # make 'site:xxx.xxx.xxx ' works.

     # Proxy
     proxies = None
@@ -54,28 +55,51 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
     start = 0
     while start < num_results:
         # Send request
-        resp = _req(escaped_term, num_results - start,
-                    lang, start, proxies, timeout, safe, ssl_verify)
+        resp = _req(escaped_term, num_results - start, lang, start, proxies, timeout, safe, ssl_verify)

         # Parse
         soup = BeautifulSoup(resp.text, "html.parser")
         result_block = soup.find_all("div", attrs={"class": "g"})
-        if len(result_block) ==0:
+        if len(result_block) == 0:
+            print("Result block empty")
Owner: Probably shouldn't have a print here
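One possible fix, sketched under the assumption that the library would report through Python's standard logging module rather than stdout (it currently has no logger; the logger name is illustrative):

import logging

logger = logging.getLogger("googlesearch")

result_block = []  # stand-in for soup.find_all("div", attrs={"class": "g"})
start = 0
if len(result_block) == 0:
    logger.warning("Result block empty")  # callers control visibility via logging config
    start += 1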

             start += 1
         for result in result_block:
-            # Find link, title, description
+
+            # Find link. If link is not found, we skip the result because this is the only necessary output
             link = result.find("a", href=True)
+            if not link:
+                continue
+            link = link["href"]
+
+            # Find title. If title is not found, we use the link as the title
             title = result.find("h3")
-            description_box = result.find(
-                "div", {"style": "-webkit-line-clamp:2"})
-            if description_box:
-                description = description_box.text
-                if link and title and description:
-                    start += 1
-                    if advanced:
-                        yield SearchResult(link["href"], title.text, description)
-                    else:
-                        yield link["href"]
+            if not title:
+                title = link
+            else:
+                title = title.text
+
+            # Find description. If description is not found, we attempt to reconstruct it by searching and combining spans.
+            # These messy descriptions are not perfect, but they're adequate when scraping results for AI agents.
+            description_box = result.find("div", {"style": "-webkit-line-clamp:2"})
+            description = description_box.text if description_box else None
+            if not description_box:
+                description_box_candidates = result.find_all("span")
+                # The description almost always has an <em> tag in it, so we use that as a heuristic to find the description
+                spans_with_em_child = [
+                    candidate for candidate in description_box_candidates if candidate.find("em", recursive=False)
+                ]
+                if len(spans_with_em_child) > 0:
+                    description_box = spans_with_em_child[0]
+                    description = description_box.text
+                # If we can't find an <em> tag, we just concatenate all the spans
+                else:
+                    description = "".join([span.text for span in result.find_all("span")[5:]])
+                    description = description.replace(title, "", 1)
+
+            if advanced:
+                yield SearchResult(link, title, description)
+            else:
+                yield link
         sleep(sleep_interval)

     if start == 0:
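For context, a usage sketch (the query string is illustrative) showing how the fallbacks surface to callers: with advanced=True the generator yields SearchResult objects whose title may fall back to the URL and whose description may be reconstructed from span text:

from googlesearch import search

for result in search("python web scraping", num_results=5, advanced=True):
    print(result.url)
    print(result.title)        # falls back to the URL when no <h3> is found
    print(result.description)  # may be a reconstructed, "messy" description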