-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy paththeoatmeal.py
82 lines (66 loc) · 2.68 KB
/
theoatmeal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#-------------------------------------------------------------------------------
# Name: theoatmeal downloader
# Purpose: Download all comics from theoatmeal.com
#
# Author: Manoj | Edited by Parin Vachhani
#
#-------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import urllib
import os
import sys
# Local imports keep this block self-contained; the try/except lets the same
# script run on both Python 2 (urllib.urlopen) and Python 3 (urllib.request).
import contextlib

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

BASE_URL = "http://theoatmeal.com"
# Characters stripped from a page title before it is used as a directory name.
TITLE_CHARS_TO_DROP = '?:*"'


def fetch(url):
    """Return the raw body of *url*, closing the connection afterwards."""
    with contextlib.closing(urlopen(url)) as response:
        return response.read()


def comic_links(hrefs):
    """Return the unique root-relative ``/comics/<name>`` links in *hrefs*.

    *hrefs* is an iterable of href attribute values; entries may be ``None``
    or empty (anchors without an href) and are skipped instead of crashing.
    First-seen order is preserved.
    """
    links = []
    seen = set()
    for href in hrefs:
        if not href:
            continue
        parts = href.split('/')
        # "/comics/foo" -> ['', 'comics', 'foo']; anything shorter, not
        # root-relative, or with an empty comic name is not a comic page.
        if len(parts) < 3 or parts[0] != "" or parts[1] != "comics" or parts[2] == "":
            continue
        if href not in seen:
            seen.add(href)
            links.append(href)
    return links


def comic_dir_name(title, fallback):
    """Return the directory name for a comic.

    Uses *title* with the characters in ``TITLE_CHARS_TO_DROP`` removed, or
    *fallback* (cleaned the same way) when the page has no usable title.
    """
    name = title if title else fallback
    for char in TITLE_CHARS_TO_DROP:
        name = name.replace(char, '')
    return name


def image_filename(src):
    """Return the local file name for a comic image URL, or ``None``.

    A comic image URL splits on '/' into at least seven parts with
    ``"comics"`` as the fifth, e.g.
    ``http://host/bucket/comics/<comic>/<file>``; the seventh part is the file
    name. Any other src (missing, relative, too short) yields ``None`` so the
    caller can skip it rather than hit an IndexError.
    """
    if not src:
        return None
    parts = src.split('/')
    if len(parts) < 7 or parts[4] != "comics":
        return None
    # Some image URLs carry a cache-busting "?reload" suffix.
    return parts[6].replace('?reload', '') or None


def download_comic(href, target_root):
    """Download every image of the comic at *href* into *target_root*.

    *href* is a root-relative ``/comics/<name>`` link. The comic gets its own
    sub-directory named after the page title; a comic whose directory already
    exists and is non-empty is skipped.
    """
    soup = BeautifulSoup(fetch(BASE_URL + href), "lxml")
    title = soup.title.string if soup.title is not None else None
    comicname = comic_dir_name(title, href.rstrip('/').split('/')[-1])
    comicdir = os.path.join(target_root, comicname)

    if not os.path.exists(comicdir):
        os.makedirs(comicdir)
    elif os.listdir(comicdir):
        print("Neglected " + comicname + " because it already exists in your directory.")
        return
    print(" Downloading " + comicname)

    for img in soup.find_all('img'):
        src = img.get('src')
        filename = image_filename(src)
        if filename is None:
            continue
        with open(os.path.join(comicdir, filename), "wb") as out:
            out.write(fetch(src))
    print("Completed Download of Comic :" + comicname)


def main(first_page=1, last_page=14):
    """Download all comics listed on index pages *first_page*..*last_page*.

    Comics are stored under an ``OatmealComics`` directory next to this
    script, which is created if missing.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    oatmealdir = os.path.join(script_dir, "OatmealComics")
    if not os.path.exists(oatmealdir):
        os.makedirs(oatmealdir)

    for page in range(first_page, last_page + 1):
        print("Entered Page " + str(page))
        index_soup = BeautifulSoup(
            fetch(BASE_URL + "/comics_pg/page:" + str(page)), "lxml")
        hrefs = [anchor.get('href') for anchor in index_soup.find_all('a')]
        for href in comic_links(hrefs):
            download_comic(href, oatmealdir)


if __name__ == "__main__":
    main()