# Validate function to filter unwanted links
def validate(string):
    Ignore = ['about/', 'classroom/', 'contact/', 'whatsnew/', 'letters/']
    return len(string) <= 60 and string not in Ignore and 'topics' not in string
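As a quick illustration of the filter (the hrefs below are made-up examples, not taken from the site):

# Hypothetical hrefs to show what passes the filter:
print(validate("NormalDistribution.html"))                # True: short, not in Ignore, no 'topics'
print(validate("about/"))                                  # False: in the Ignore list
print(validate("topics/ProbabilityandStatistics.html"))    # False: contains 'topics'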
# Request with retries and custom headers
def get_request_with_retries(url, retries=3, timeout=5):
    headers = {
        'User-Agent': random.choice([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
        ])
    }
    for i in range(retries):
        try:
            # Uncomment the `proxies` argument if routing requests through a proxy
            # resp = requests.get(url, timeout=timeout, proxies=proxies, headers=headers)
            resp = requests.get(url, timeout=timeout, headers=headers)
            return resp
        except requests.exceptions.RequestException as e:
            print(f"Attempt {i+1} failed for URL: {url}. Error: {e}")
            time.sleep(2 + random.uniform(0, 1.5))  # Randomized sleep between retries
    return None  # All attempts failed
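A minimal usage sketch: the helper returns None once every retry fails, so callers must check for that before reading status_code, as the crawl loop below does (the URL and retry count here are arbitrary):

resp = get_request_with_retries("https://mathworld.wolfram.com/", retries=2, timeout=5)
if resp is not None and resp.status_code == 200:
    print(f"OK, {len(resp.text)} characters fetched")
else:
    print("Fetch failed after retries")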
# Crawling phase (Step 1)
def crawl(seed_URL, seed_category, file1, file2):
    global parsed, n_URLs
    URL_list.append(seed_URL)
    URL_parent_Category[seed_URL] = seed_category
    categoryLevel[seed_category] = 1
    while parsed < min(max_URLs, n_URLs):
        URL = URL_list[parsed]
        parent_category = URL_parent_Category[URL]
        level = categoryLevel[parent_category]
        time.sleep(2 + random.uniform(0, 1.5))  # Slow down crawling
        parsed += 1
        if URL in history:
            print(f"Duplicate: {URL}")
            file1.write(f"{URL}\tDuplicate\t{parent_category}\t{level}\n")
        else:
            print(f"Parsing: {parsed}/{n_URLs}: {URL}")
            resp = get_request_with_retries(URL)
            # A Response with an error status is falsy, so compare against None explicitly
            if resp is not None:
                history[URL] = resp.status_code
            else:
                history[URL] = "Error"
            if resp is None or resp.status_code != 200:
                reason = resp.reason if resp is not None else "Timeout"
                print(f"Failed: {URL} - {reason}")
                file1.write(f"{URL}\tError:{resp.status_code if resp is not None else 'Timeout'}\t{reason}\t{parent_category}\t{level}\n")
            else:
                file1.write(f"{URL}\tParsed\t{parent_category}\t{level}\n")
                page = resp.text.replace('\n', ' ')
                soup = BeautifulSoup(page, 'html.parser')
                # Scrape intermediate directory pages (Type-1): links under /topics/
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if 'topics/' in href:
                        new_URL = URL_base1 + href.split("/topics/")[1]
                        new_category = link.text.strip()
                        URL_list.append(new_URL)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tQueued\t{new_category}\t{level+1}\n")
                        n_URLs += 1
                # Scrape final pages (Type-2): leaf entries outside /topics/
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if validate(href):
                        new_URL = URL_base2 + href.split("/")[1]
                        new_category = link.text.strip()
                        final_URLs[new_URL] = (new_category, parent_category, level+1)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tEndNode\t{new_category}\t{level+1}\n")
    print(f"Crawling completed. Parsed {parsed} URLs out of {n_URLs}.")
# Content extraction phase (Step 2)
def extract_content(begin, end):
    with open("list_final_URLs.txt", "r", encoding="utf-8") as file_input:
        Lines = file_input.readlines()
    with open(f"crawl_final_{begin}_{end}.txt", "w", encoding="utf-8") as file_output:
        for line in Lines:
            count, URL, category = line.split("\t")[:3]
            if begin <= int(count) <= end:
                print(f"Page {count}: {URL}")
                resp = get_request_with_retries(URL)
                if resp is not None and resp.status_code == 200:
                    page = resp.text.replace('\n', ' ')
                    file_output.write(f"{URL}\t{category}\t~{page}\n")
                else:
                    print(f"Error fetching {URL}: {resp.status_code if resp is not None else 'Timeout'}")
    print(f"Content extraction from {begin} to {end} completed.")
# Main execution
if __name__ == "__main__":
    # Open files for logging
    with open("crawl_log.txt", "w", encoding="utf-8") as file1, open("crawl_categories.txt", "w", encoding="utf-8") as file2:
        crawl(seed_URL="https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html", seed_category="Probability and Statistics", file1=file1, file2=file2)
    # Extract content from final URLs (modify begin and end as needed)
    extract_content(begin=1, end=500)
    # Completion message
    print("All tasks completed successfully.")
import requests
import time
import random
from bs4 import BeautifulSoup
# Global variables
URL_list = []              # Queue of directory URLs to crawl
URL_parent_Category = {}   # URL -> category assigned to that URL
categoryLevel = {}         # category -> depth in the topic hierarchy
history = {}               # URL -> status code (or "Error") for pages already visited
final_URLs = {}            # final page URL -> (category, parent category, level)
parsed = 0                 # number of URLs processed so far
n_URLs = 1                 # number of URLs discovered so far (starts at 1 for the seed)
max_URLs = 5000            # crawl budget
# URL bases
URL_base1 = "https://mathworld.wolfram.com/topics/"  # for directory pages (Type-1)
URL_base2 = "https://mathworld.wolfram.com/"         # for final pages (Type-2)
# Seed URL and category
seed_URL = "https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html"
seed_category = "Probability and Statistics"
categoryLevel[seed_category] = 1  # Start category level
# Proxy setup (optional: fill in real credentials here and uncomment the
# `proxies` argument in get_request_with_retries to route traffic through a proxy)
proxies = {
    'http': 'http://username:password@proxy_url:proxy_port',
    'https': 'https://username:password@proxy_url:proxy_port',
}
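If a proxy is configured, requests accepts the mapping directly through its proxies= parameter; a quick sanity-check sketch (the placeholder credentials above mean this is illustrative only):

# Hypothetical one-off check that the proxy settings work before crawling:
test = requests.get("https://mathworld.wolfram.com/", proxies=proxies, timeout=5)
print(test.status_code)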
# Update lists of URLs and categories
def update_lists(new_URL, new_category, parent_category, file):
    URL_parent_Category[new_URL] = new_category
    categoryLevel[new_category] = 1 + categoryLevel[parent_category]
    level = str(categoryLevel[new_category])
    file.write(f"{level}\t{new_category}\t{parent_category}\n")
    file.flush()
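A hypothetical call, just to show what one invocation records and writes (the category names and level are made up):

# Hypothetical example: a sub-category "Continuous Distributions" found under
# a parent "Distributions" that sits at level 2 of the hierarchy.
categoryLevel["Distributions"] = 2
with open("crawl_categories.txt", "a", encoding="utf-8") as file2:
    update_lists(URL_base1 + "ContinuousDistributions.html", "Continuous Distributions", "Distributions", file2)
# Appends the line "3\tContinuous Distributions\tDistributions" to crawl_categories.txt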