"""Fetch a web page and save its main content block to an HTML file."""
# Provenance: recovered from the patch "html scrapper" (commit 392d9ac),
# which added this module as html_scrapper.py.

import logging

import requests
from bs4 import BeautifulSoup

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Upper bound (seconds) on how long a single HTTP request may block.
# Without a timeout, requests waits forever on an unresponsive server.
DEFAULT_TIMEOUT_SECONDS = 30


def scrape_inner_html(
    url,
    output_file,
    *,
    content_class="content",
    timeout=DEFAULT_TIMEOUT_SECONDS,
):
    """Download *url* and write its main content ``<div>`` to *output_file*.

    The page is parsed with Beautiful Soup's ``html.parser``. The first
    ``<div>`` carrying the CSS class *content_class* is taken as the main
    content; any ``<header>`` / ``<footer>`` elements nested inside it are
    removed before the markup is written out as UTF-8.

    This is a best-effort helper: failures are logged, never raised.

    Args:
        url: Address of the page to fetch.
        output_file: Path of the file to (over)write with the extracted HTML.
        content_class: CSS class identifying the main content ``<div>``.
            Adjust this to match the structure of the target website.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        None. If no matching ``<div>`` is found a warning is logged and no
        file is written.
    """
    try:
        # Fetch the webpage; raise_for_status turns 4xx/5xx into exceptions.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        main_content = soup.find('div', class_=content_class)
        if not main_content:
            logger.warning("Main content not found for %s", url)
            return

        # Drop page chrome that happens to live inside the content container.
        for unwanted in main_content.find_all(['header', 'footer']):
            unwanted.extract()

        # NOTE(review): str(tag) serialises the wrapping <div> as well as its
        # children, i.e. outer HTML despite the function name. Kept as-is for
        # backward compatibility; confirm whether callers expect only the
        # children (Tag.decode_contents()).
        inner_html = str(main_content)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(inner_html)

        logger.info("Inner HTML saved to %s", output_file)

    # RequestException must come first: it is itself an OSError subclass.
    except requests.RequestException as e:
        logger.error("Error fetching %s: %s", url, e)
    except OSError as e:
        logger.error("Error writing %s: %s", output_file, e)
    except Exception as e:
        # Top-level boundary: keep the best-effort contract, but record the
        # traceback so unexpected failures are diagnosable.
        logger.exception("An unexpected error occurred: %s", e)


# Usage (guarded so that importing this module does not trigger a download)
if __name__ == "__main__":
    scrape_inner_html('https://example.com/page', 'output.html')