scraper.py
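"""Scrape each blog post listed in list.txt, download its images into a
local img/ folder, and write the posts out as a WordPress WXR import file."""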
import json
import os
import xml.etree.ElementTree as ET
from datetime import datetime

import requests
from bs4 import BeautifulSoup

def fetch_webpage_content(url):
    """Fetch a page's HTML, returning None on any request failure."""
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        )
    }
    try:
        # A timeout keeps one unresponsive host from hanging the whole run
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None
    return response.text
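# Illustrative call (hypothetical URL):
#     html = fetch_webpage_content("https://example.com/blog/my-post")
# html is the raw page source on success and None on a network error or
# non-2xx response, which main() uses to skip the post.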

def download_images(img_urls):
    """Download each image URL into a local 'img' folder."""
    os.makedirs('img', exist_ok=True)
    for img_url in img_urls:
        try:
            response = requests.get(img_url, stream=True, timeout=30)
            response.raise_for_status()
            # Extract the image name from the URL; note this keeps any query
            # string, so URLs like ".../photo.jpg?v=2" may need extra cleanup
            img_name = os.path.basename(img_url)
            # Stream the image into the 'img' folder in 1 KB chunks
            img_path = os.path.join('img', img_name)
            with open(img_path, 'wb') as img_file:
                for chunk in response.iter_content(1024):
                    img_file.write(chunk)
            print(f"Downloaded: {img_url} -> {img_path}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {img_url}: {e}")

def extract_content_and_title(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    # Extract the title (h1 tag) and convert to title case
    h1 = soup.find('h1')
    title = h1.get_text(strip=True).title() if h1 else "Untitled"
    # str.title() also capitalizes letters after apostrophes ("Don’T"); undo that
    title = title.replace("’L", "’l").replace("’S", "’s").replace("’T", "’t")
    # Extract the content within the #dm_content div
    dm_content_div = soup.find('div', id='dm_content')
    img_urls = []
    if dm_content_div:
        # Record each image's absolute URL for download, then rewrite its src
        # to the (hardcoded) WordPress upload folder the import will use
        for img in dm_content_div.find_all('img'):
            if img.get('src'):
                if not img['src'].startswith('http'):
                    img['src'] = requests.compat.urljoin(base_url, img['src'])
                img_urls.append(img['src'])
                img['src'] = f"/wp-content/uploads/2024/12/{os.path.basename(img['src'])}"
        # Convert the div's content to HTML
        content = str(dm_content_div)
    else:
        content = "<p>No content found.</p>"
    # Extract JSON-LD structured data for the publication date
    date_published = ""
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            # script.string can be None; "or ''" turns that into a parse error
            # the except clause skips, instead of a TypeError crash
            data = json.loads(script.string or "")
            if isinstance(data, dict) and data.get('@type') == 'BlogPosting':
                date_published = data.get('datePublished', "")
                break
        except json.JSONDecodeError:
            continue
    return title, content, img_urls, date_published
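# The JSON-LD block this expects resembles (illustrative):
#     {"@context": "https://schema.org", "@type": "BlogPosting",
#      "datePublished": "2024-12-27T10:30:00+00:00", ...}
# so date_published comes back as an ISO 8601 string, or "" when no
# BlogPosting object is found. Sites that wrap the data in a list are skipped.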

def save_to_wordpress_post_xml(posts, filename_prefix="wordpress_import_posts"):
    # Generate filename with current datetime
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    filename = f"{filename_prefix}_{timestamp}.xml"
    # Create the root element. The xmlns:* declarations are passed as plain
    # attributes: ElementTree writes them verbatim, which is enough for the
    # importer even though ET never treats them as real namespaces.
    root = ET.Element("rss", version="2.0", attrib={
        "xmlns:excerpt": "http://wordpress.org/export/1.2/excerpt/",
        "xmlns:content": "http://purl.org/rss/1.0/modules/content/",
        "xmlns:wfw": "http://wellformedweb.org/CommentAPI/",
        "xmlns:dc": "http://purl.org/dc/elements/1.1/",
        "xmlns:wp": "http://wordpress.org/export/1.2/"
    })
    # Channel element
    channel = ET.SubElement(root, "channel")
    # Add site metadata
    ET.SubElement(channel, "title").text = "Your WordPress Site Title"
    ET.SubElement(channel, "link").text = "https://example.com"
    ET.SubElement(channel, "description").text = "This is a WordPress blog post export file."
    ET.SubElement(channel, "wp:wxr_version").text = "1.2"
    ET.SubElement(channel, "wp:base_site_url").text = "https://example.com"
    ET.SubElement(channel, "wp:base_blog_url").text = "https://example.com"
    # Add posts to the channel
    for post_id, (title, content, url, date_published) in enumerate(posts, start=1):
        # WXR wants "YYYY-MM-DD HH:MM:SS"; trim an ISO 8601 value down to that
        post_date = date_published.replace('T', ' ')[:19] if date_published else "2024-12-27 00:00:00"
        item = ET.SubElement(channel, "item")
        ET.SubElement(item, "title").text = title
        ET.SubElement(item, "link").text = url
        # pubDate is nominally RFC 2822, but the importer keys off wp:post_date,
        # so the raw ISO value (or a fixed fallback) is passed through here
        ET.SubElement(item, "pubDate").text = date_published if date_published else "Mon, 27 Dec 2024 00:00:00 +0000"
        ET.SubElement(item, "dc:creator").text = "admin"
        ET.SubElement(item, "guid", isPermaLink="false").text = url
        ET.SubElement(item, "description").text = ""
        # Assign the HTML directly: ElementTree escapes it on write and the
        # importer unescapes it. Wrapping it in a literal "<![CDATA[...]]>"
        # string would itself get escaped and corrupt the imported content.
        ET.SubElement(item, "content:encoded").text = content
        ET.SubElement(item, "excerpt:encoded").text = ""
        ET.SubElement(item, "wp:post_id").text = str(post_id)
        ET.SubElement(item, "wp:post_date").text = post_date
        ET.SubElement(item, "wp:post_date_gmt").text = post_date
        ET.SubElement(item, "wp:comment_status").text = "open"
        ET.SubElement(item, "wp:ping_status").text = "open"
        # Extract the slug from the URL; rstrip handles trailing slashes
        ET.SubElement(item, "wp:post_name").text = url.rstrip('/').split('/')[-1]
        ET.SubElement(item, "wp:status").text = "publish"
        ET.SubElement(item, "wp:post_parent").text = "0"
        ET.SubElement(item, "wp:menu_order").text = "0"
        ET.SubElement(item, "wp:post_type").text = "post"
        ET.SubElement(item, "wp:post_password").text = ""
        ET.SubElement(item, "wp:is_sticky").text = "0"
    # Write to an XML file
    tree = ET.ElementTree(root)
    with open(filename, "wb") as xml_file:
        tree.write(xml_file, encoding="utf-8", xml_declaration=True)
    print(f"Content saved to {filename}")

def main():
    try:
        with open("list.txt", "r") as file:
            urls = [line.strip() for line in file if line.strip()]
    except FileNotFoundError:
        print("Error: list.txt file not found.")
        return
    posts = []
    for url in urls:
        print(f"Processing URL: {url}")
        html = fetch_webpage_content(url)
        if html:
            title, content, img_urls, date_published = extract_content_and_title(html, url)
            # Download images to the 'img' folder
            download_images(img_urls)
            # Append post data for XML export
            posts.append((title, content, url, date_published))
    if posts:
        save_to_wordpress_post_xml(posts)


if __name__ == "__main__":
    main()
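# Usage sketch (URLs illustrative): list one post URL per line in list.txt
# next to this script, e.g.
#     https://example.com/blog/first-post
#     https://example.com/blog/second-post
# then run `python scraper.py`. Images land in ./img/ and the WXR export is
# written as wordpress_import_posts_<timestamp>.xml in the working directory.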