Combined and improved your bright_data.py and your crawl_directory.py #5

Crh2123 commented Oct 1, 2024

import requests
import time
import random
from bs4 import BeautifulSoup

# Global variables

URL_list = []
URL_parent_Category = {}
categoryLevel = {}
history = {}
final_URLs = {}
parsed = 0
n_URLs = 1
max_URLs = 5000

# Base URLs

URL_base1 = "https://mathworld.wolfram.com/topics/" # for directory pages (root)
URL_base2 = "https://mathworld.wolfram.com/" # for final pages

# Seed URL and category

seed_URL = "https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html"
seed_category = "Probability and Statistics"
categoryLevel[seed_category] = 1 # Start category level

# Proxy setup (optional: fill in credentials and uncomment the `proxies` argument in get_request_with_retries)
proxies = {
    'http': 'http://username:password@proxy_url:proxy_port',
    'https': 'https://username:password@proxy_url:proxy_port',
}

# Validate function to filter unwanted links

def validate(string):
    Ignore = ['about/', 'classroom/', 'contact/', 'whatsnew/', 'letters/']
    return len(string) <= 60 and string not in Ignore and 'topics' not in string

# Request with retries and custom headers

def get_request_with_retries(url, retries=3, timeout=5):
    headers = {
        'User-Agent': random.choice([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
        ])
    }

    for i in range(retries):
        try:
            # Uncomment `proxies` argument if using proxy
            # resp = requests.get(url, timeout=timeout, proxies=proxies, headers=headers)
            resp = requests.get(url, timeout=timeout, headers=headers)
            return resp
        except requests.exceptions.RequestException as e:
            print(f"Attempt {i+1} failed for URL: {url}. Error: {e}")
            time.sleep(2 + random.uniform(0, 1.5))  # Randomized sleep before retrying
    return None

# Update lists of URLs and categories

def update_lists(new_URL, new_category, parent_category, file):
    URL_parent_Category[new_URL] = new_category
    categoryLevel[new_category] = 1 + categoryLevel[parent_category]
    level = str(categoryLevel[new_category])
    file.write(f"{level}\t{new_category}\t{parent_category}\n")
    file.flush()

# Crawling phase (Step 1)

def crawl(seed_URL, seed_category, file1, file2):
    global parsed, n_URLs
    URL_list.append(seed_URL)
    URL_parent_Category[seed_URL] = seed_category
    categoryLevel[seed_category] = 1

    while parsed < min(max_URLs, n_URLs):
        URL = URL_list[parsed]
        parent_category = URL_parent_Category[URL]
        level = categoryLevel[parent_category]
        time.sleep(2 + random.uniform(0, 1.5))  # Slow down crawling
        parsed += 1

        if URL in history:
            print(f"Duplicate: {URL}")
            file1.write(f"{URL}\tDuplicate\t{parent_category}\t{level}\n")
        else:
            print(f"Parsing: {parsed}/{n_URLs}: {URL}")
            resp = get_request_with_retries(URL)
            if resp:
                history[URL] = resp.status_code
            else:
                history[URL] = "Error"

            if not resp or resp.status_code != 200:
                reason = resp.reason if resp else "Timeout"
                print(f"Failed: {URL} - {reason}")
                file1.write(f"{URL}\tError:{resp.status_code if resp else 'Timeout'}\t{reason}\t{parent_category}\t{level}\n")
            else:
                file1.write(f"{URL}\tParsed\t{parent_category}\t{level}\n")
                page = resp.text.replace('\n', ' ')
                soup = BeautifulSoup(page, 'html.parser')

                # Scrape intermediate directories (Type-1)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if 'topics/' in href:
                        new_URL = URL_base1 + href.split("/topics/")[1]
                        new_category = link.text.strip()
                        URL_list.append(new_URL)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tQueued\t{new_category}\t{level+1}\n")
                        n_URLs += 1

                # Scrape final pages (Type-2)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if validate(href):
                        new_URL = URL_base2 + href.split("/")[1]
                        new_category = link.text.strip()
                        final_URLs[new_URL] = (new_category, parent_category, level+1)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tEndNode\t{new_category}\t{level+1}\n")

    print(f"Crawling completed. Parsed {parsed} URLs out of {n_URLs}.")

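# Note: extract_content() below reads list_final_URLs.txt, but nothing above writes that file.
# The helper below is a minimal sketch (not part of the original scripts; the function name and
# default filename are my own) that dumps final_URLs in the count<TAB>URL<TAB>category layout
# extract_content() expects. Call it after crawl() and before extract_content() to chain the two steps.

def save_final_URLs(filename="list_final_URLs.txt"):
    # final_URLs maps URL -> (category, parent_category, level); write one numbered row per URL
    with open(filename, "w", encoding="utf-8") as file_output:
        for count, (URL, (category, parent_category, level)) in enumerate(final_URLs.items(), start=1):
            file_output.write(f"{count}\t{URL}\t{category}\t{parent_category}\t{level}\n")
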
# Content extraction phase (Step 2)

def extract_content(begin, end):
    with open("list_final_URLs.txt", "r", encoding="utf-8") as file_input:
        Lines = file_input.readlines()

    with open(f"crawl_final_{begin}_{end}.txt", "w", encoding="utf-8") as file_output:
        for line in Lines:
            count, URL, category = line.split("\t")[:3]
            if begin <= int(count) <= end:
                print(f"Page {count}: {URL}")
                resp = get_request_with_retries(URL)
                if resp and resp.status_code == 200:
                    page = resp.text.replace('\n', ' ')
                    file_output.write(f"{URL}\t{category}\t~{page}\n")
                else:
                    print(f"Error fetching {URL}: {resp.status_code if resp else 'Timeout'}")

    print(f"Content extraction from {begin} to {end} completed.")

# Main execution

if __name__ == "__main__":
    # Open files for logging
    with open("crawl_log.txt", "w", encoding="utf-8") as file1, \
         open("crawl_categories.txt", "w", encoding="utf-8") as file2:
        crawl(seed_URL="https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html",
              seed_category="Probability and Statistics", file1=file1, file2=file2)

    # Extract content from final URLs (modify begin and end as needed)
    extract_content(begin=1, end=500)

    # Completion message
    print("All tasks completed successfully.")
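
For reference, each line written by extract_content() has the form URL, category, and the raw page HTML (newlines stripped) prefixed with "~", separated by tabs. Below is a small optional sketch, not from the original scripts (the function name and default filename are my own), showing one way such a dump could be read back and reduced to plain text with BeautifulSoup:

def load_extracted_pages(filename="crawl_final_1_500.txt"):
    # Parse the URL \t category \t ~html rows produced by extract_content()
    pages = {}
    with open(filename, "r", encoding="utf-8") as file_input:
        for line in file_input:
            URL, category, raw = line.split("\t", 2)
            html = raw.lstrip("~")  # drop the '~' marker before the page body
            text = BeautifulSoup(html, "html.parser").get_text(" ", strip=True)
            pages[URL] = (category, text)
    return pages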