-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
58 lines (47 loc) · 2.05 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import re
from bs4 import BeautifulSoup
import requests
def normalize_wrapped_text(text):
    """Return *text* on a single line with run-together words separated.

    Two passes are applied, in this order:

    1. Every run of whitespace (spaces, tabs, newlines, ...) is collapsed
       into one space and leading/trailing whitespace is removed.
    2. A space is inserted wherever a lowercase ASCII letter is immediately
       followed by an uppercase ASCII letter, e.g. ``"helloWorld"`` becomes
       ``"hello World"``.

    The second pass is a heuristic: it only looks at ``a-z`` / ``A-Z`` and it
    also splits intentional camelCase words (``"iPhone"`` -> ``"i Phone"``).

    Args:
        text: The string to clean up. ``None`` is treated as empty text.

    Returns:
        The normalized string; ``""`` when *text* is ``None`` or contains
        only whitespace.
    """
    if text is None:
        # Nothing to normalize; avoid a TypeError from re.sub.
        return ""
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    return text
def find_similar_elements(url, example_text, user_agent=None, *, timeout=10):
    """Print the text of every element that resembles the one holding *example_text*.

    The page at *url* is downloaded and parsed with ``html.parser``. The
    text node containing *example_text* is located, and the name and
    ``class`` attribute of its parent tag are used as the selection criteria
    for ``soup.find_all``. The text of each selected element is normalized
    and printed on its own line.

    Args:
        url: Address of the page to fetch.
        example_text: Substring to look for in the page's text nodes. The
            comparison is case-sensitive and is made against the raw
            (un-normalized) text.
        user_agent: Value sent in the ``User-Agent`` request header. When
            ``None``, a desktop Chrome user-agent string is used.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Keyword-only. Without a timeout a stalled server would block
            the call indefinitely.

    Returns:
        None. Results and diagnostics are written to standard output. A
        failed request (including a timeout) or a missing example text is
        reported there and the function returns early.
    """
    # Default user-agent string (if not provided)
    if user_agent is None:
        user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/112.0.0.0 Safari/537.36"
        )
    headers = {"User-Agent": user_agent}

    # Fetch the HTML content from the URL with custom headers. Timeouts are
    # RequestException subclasses, so they are reported by the same handler.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    # Locate the text node containing the example text
    element = soup.find(string=lambda text: text and example_text in text)
    if element is None:
        print(f"Could not find the example text: {example_text}")
        return

    # The enclosing tag of that text node defines what "similar" means
    parent_tag = element.parent
    print(f"Found example text in tag: <{parent_tag.name} class='{parent_tag.get('class')}'>\n")

    # Select by the same tag name and class attribute value
    similar_elements = soup.find_all(parent_tag.name, class_=parent_tag.get("class"))

    # Normalize and print each element's text on its own line
    for similar in similar_elements:
        print(normalize_wrapped_text(similar.get_text()))
if __name__ == "__main__":
    # Interactive entry point: ask for the page address and a sample snippet,
    # then print every element resembling the one that holds the snippet.
    page_url = input("Enter the URL of the webpage: ").strip()
    sample_text = input("Enter the example text to search for: ").strip()
    find_similar_elements(url=page_url, example_text=sample_text)