Combined and improved your bright_data.py and your crawl_directory.py #5

Crh2123 commented Oct 1, 2024

import requests
import time
import random
from bs4 import BeautifulSoup

# Global variables

URL_list = []
URL_parent_Category = {}
categoryLevel = {}
history = {}
final_URLs = {}
parsed = 0
n_URLs = 1
max_URLs = 5000

# Base URLs

URL_base1 = "https://mathworld.wolfram.com/topics/" # for directory pages (root)
URL_base2 = "https://mathworld.wolfram.com/" # for final pages

# Seed URL and category

seed_URL = "https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html"
seed_category = "Probability and Statistics"
categoryLevel[seed_category] = 1 # Start category level

# Proxy setup (optional: fill in credentials and uncomment the `proxies` argument in get_request_with_retries)
proxies = {
    'http': 'http://username:password@proxy_url:proxy_port',
    'https': 'https://username:password@proxy_url:proxy_port',
}

# Validate function to filter unwanted links

def validate(string):
    Ignore = ['about/', 'classroom/', 'contact/', 'whatsnew/', 'letters/']
    return len(string) <= 60 and string not in Ignore and 'topics' not in string

# Request with retries and custom headers

def get_request_with_retries(url, retries=3, timeout=5):
    headers = {
        'User-Agent': random.choice([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
        ])
    }

    for i in range(retries):
        try:
            # Uncomment `proxies` argument if using proxy
            # resp = requests.get(url, timeout=timeout, proxies=proxies, headers=headers)
            resp = requests.get(url, timeout=timeout, headers=headers)
            return resp
        except requests.exceptions.RequestException as e:
            print(f"Attempt {i+1} failed for URL: {url}. Error: {e}")
            time.sleep(2 + random.uniform(0, 1.5))  # Randomized sleep before retrying
    return None

# Update lists of URLs and categories

def update_lists(new_URL, new_category, parent_category, file):
    URL_parent_Category[new_URL] = new_category
    categoryLevel[new_category] = 1 + categoryLevel[parent_category]
    level = str(categoryLevel[new_category])
    file.write(f"{level}\t{new_category}\t{parent_category}\n")
    file.flush()

# Crawling phase (Step 1)

def crawl(seed_URL, seed_category, file1, file2):
    global parsed, n_URLs
    URL_list.append(seed_URL)
    URL_parent_Category[seed_URL] = seed_category
    categoryLevel[seed_category] = 1

    while parsed < min(max_URLs, n_URLs):
        URL = URL_list[parsed]
        parent_category = URL_parent_Category[URL]
        level = categoryLevel[parent_category]
        time.sleep(2 + random.uniform(0, 1.5))  # Slow down crawling
        parsed += 1

        if URL in history:
            print(f"Duplicate: {URL}")
            file1.write(f"{URL}\tDuplicate\t{parent_category}\t{level}\n")
        else:
            print(f"Parsing: {parsed}/{n_URLs}: {URL}")
            resp = get_request_with_retries(URL)
            if resp:
                history[URL] = resp.status_code
            else:
                history[URL] = "Error"

            if not resp or resp.status_code != 200:
                reason = resp.reason if resp else "Timeout"
                print(f"Failed: {URL} - {reason}")
                file1.write(f"{URL}\tError:{resp.status_code if resp else 'Timeout'}\t{reason}\t{parent_category}\t{level}\n")
            else:
                file1.write(f"{URL}\tParsed\t{parent_category}\t{level}\n")
                page = resp.text.replace('\n', ' ')
                soup = BeautifulSoup(page, 'html.parser')

                # Scrape intermediate directories (Type-1)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if 'topics/' in href:
                        new_URL = URL_base1 + href.split("/topics/")[1]
                        new_category = link.text.strip()
                        URL_list.append(new_URL)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tQueued\t{new_category}\t{level+1}\n")
                        n_URLs += 1

                # Scrape final pages (Type-2)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if validate(href):
                        new_URL = URL_base2 + href.split("/")[1]
                        new_category = link.text.strip()
                        final_URLs[new_URL] = (new_category, parent_category, level+1)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(f"{new_URL}\tEndNode\t{new_category}\t{level+1}\n")

    print(f"Crawling completed. Parsed {parsed} URLs out of {n_URLs}.")

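# Note: extract_content() below reads list_final_URLs.txt, but nothing above writes that file.
# The helper below is a minimal sketch (not part of the original scripts; the function name and
# default filename are my own) that dumps final_URLs in the count<TAB>URL<TAB>category layout
# extract_content() expects. Call it after crawl() and before extract_content() to chain the two steps.

def save_final_URLs(filename="list_final_URLs.txt"):
    # final_URLs maps URL -> (category, parent_category, level); write one numbered row per URL
    with open(filename, "w", encoding="utf-8") as file_output:
        for count, (URL, (category, parent_category, level)) in enumerate(final_URLs.items(), start=1):
            file_output.write(f"{count}\t{URL}\t{category}\t{parent_category}\t{level}\n")
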
# Content extraction phase (Step 2)

def extract_content(begin, end):
    with open("list_final_URLs.txt", "r", encoding="utf-8") as file_input:
        Lines = file_input.readlines()

    with open(f"crawl_final_{begin}_{end}.txt", "w", encoding="utf-8") as file_output:
        for line in Lines:
            count, URL, category = line.split("\t")[:3]
            if begin <= int(count) <= end:
                print(f"Page {count}: {URL}")
                resp = get_request_with_retries(URL)
                if resp and resp.status_code == 200:
                    page = resp.text.replace('\n', ' ')
                    file_output.write(f"{URL}\t{category}\t~{page}\n")
                else:
                    print(f"Error fetching {URL}: {resp.status_code if resp else 'Timeout'}")

    print(f"Content extraction from {begin} to {end} completed.")

# Main execution

if __name__ == "__main__":
    # Open files for logging
    with open("crawl_log.txt", "w", encoding="utf-8") as file1, \
         open("crawl_categories.txt", "w", encoding="utf-8") as file2:
        crawl(seed_URL="https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html",
              seed_category="Probability and Statistics", file1=file1, file2=file2)

    # Extract content from final URLs (modify begin and end as needed)
    extract_content(begin=1, end=500)

    # Completion message
    print("All tasks completed successfully.")
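
For reference, each line written by extract_content() has the form URL, category, and the raw page HTML (newlines stripped) prefixed with "~", separated by tabs. Below is a small optional sketch, not from the original scripts (the function name and default filename are my own), showing one way such a dump could be read back and reduced to plain text with BeautifulSoup:

def load_extracted_pages(filename="crawl_final_1_500.txt"):
    # Parse the URL \t category \t ~html rows produced by extract_content()
    pages = {}
    with open(filename, "r", encoding="utf-8") as file_input:
        for line in file_input:
            URL, category, raw = line.split("\t", 2)
            html = raw.lstrip("~")  # drop the '~' marker before the page body
            text = BeautifulSoup(html, "html.parser").get_text(" ", strip=True)
            pages[URL] = (category, text)
    return pages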