-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
208c164
commit 392d9ac
Showing
1 changed file
with
41 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
import logging | ||
|
||
# Module-wide logging: create a logger named after this module, then ask
# the root logger to emit records at INFO level and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
|
||
def scrape_inner_html(url, output_file, *, content_class='content', timeout=10):
    """Fetch a page and save the inner HTML of its main content ``<div>``.

    The page at *url* is downloaded and parsed, the first ``<div>`` whose
    class matches *content_class* is located, any ``<header>``/``<footer>``
    elements nested inside it are removed, and the remaining markup is
    written to *output_file* as UTF-8.

    The function is best-effort: failures are logged through the module
    ``logger`` instead of being raised, and nothing is written when the
    page cannot be fetched or the content element is missing.

    Args:
        url: Address of the page to download.
        output_file: Path of the file to create or overwrite.
        content_class: CSS class of the ``<div>`` holding the main content;
            adjust to match the target site's structure.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.
    """
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # server that accepts the connection but never answers.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        soup = BeautifulSoup(response.text, 'html.parser')

        main_content = soup.find('div', class_=content_class)
        if main_content is None:
            logger.warning("Main content not found for %s", url)
            return

        # Drop any header/footer elements nested inside the content.
        for unwanted in main_content.find_all(['header', 'footer']):
            unwanted.extract()

        # decode_contents() renders only the element's children; str(tag)
        # would also include the wrapping <div>, i.e. the *outer* HTML.
        inner_html = main_content.decode_contents()

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(inner_html)

        logger.info("Inner HTML saved to %s", output_file)

    except requests.RequestException as e:
        logger.error("Error fetching %s: %s", url, e)
    except Exception as e:
        # Top-level boundary: stay best-effort, but keep the traceback so
        # parse and file-system failures can actually be diagnosed.
        logger.exception("An unexpected error occurred: %s", e)
|
||
# Usage: run the example only when this file is executed as a script, so
# that importing the module does not trigger a network request or write
# a file as a side effect.
if __name__ == "__main__":
    scrape_inner_html('https://example.com/page', 'output.html')