Skip to content

Commit

Permalink
html scrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
kris-araptus committed Sep 23, 2024
1 parent 208c164 commit 392d9ac
Showing 1 changed file with 41 additions and 0 deletions.
41 changes: 41 additions & 0 deletions html_scrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import requests
from bs4 import BeautifulSoup
import logging

# Set up logging: configure the root logger to emit INFO and above, then
# create a module-level logger named after this module for use below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def scrape_inner_html(url, output_file, *, timeout=10):
    """Fetch *url* and save its main content element to *output_file*.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup's
    ``html.parser``. The first ``<div class="content">`` is taken as the main
    content; any ``<header>`` or ``<footer>`` elements nested inside it are
    removed, and the element is serialised with ``str()`` and written to
    *output_file* as UTF-8.

    This function is best-effort: it never raises. Every failure is logged
    and the function returns without writing (or after a partial write if the
    failure happened while writing).

    Args:
        url: Address of the page to fetch.
        output_file: Path of the file to (over)write with the extracted HTML.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; keyword-only.

    Returns:
        None in every case, including when the content div is not found.
    """
    try:
        # Fetch the webpage. An explicit timeout is required: without one,
        # requests will wait indefinitely on an unresponsive server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the main content. Only a div with class 'content' is
        # matched; adjust the selector to suit the target site's structure.
        main_content = soup.find('div', class_='content')
        if main_content is None:
            logger.warning(f"Main content not found for {url}")
            return

        # Remove any header and footer elements nested inside the main content.
        for unwanted in main_content.find_all(['header', 'footer']):
            unwanted.extract()

        # NOTE(review): str() on the element serialises the element itself,
        # so the enclosing <div> tag is part of the saved output.
        inner_html = str(main_content)

        # Save the HTML content to a file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(inner_html)

        logger.info(f"Inner HTML saved to {output_file}")

    except requests.RequestException as e:
        # Covers connection errors, timeouts and bad status codes.
        logger.error(f"Error fetching {url}: {e}")
    except Exception as e:
        # Deliberate catch-all so the caller never sees an exception;
        # logger.exception keeps the traceback so the cause is not lost.
        logger.exception(f"An unexpected error occurred: {e}")

# Usage: example call that fetches the given URL and tries to save its main
# content to 'output.html'.
# NOTE(review): this call sits at module level, so it also runs when the file
# is imported; consider an `if __name__ == "__main__":` guard.
scrape_inner_html('https://example.com/page', 'output.html')

0 comments on commit 392d9ac

Please sign in to comment.