v1.2.2
Should have committed this long ago.
vintol committed Mar 31, 2019
1 parent 403b34b commit eabb509
Showing 3 changed files with 76 additions and 36 deletions.
26 changes: 25 additions & 1 deletion CHANGELOG.md
@@ -5,10 +5,34 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).

------------
## [Upcoming Release]
## [1.2.1] - 2018-01-27
### Added
- Capability to handle the HTTPS protocol.
### Deprecated
- Forcing use of the HTTP protocol because of SSL certificate issues.
### Security
- The HTTPS certificate is not verified (see the sketch below).
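
The unverified-certificate behaviour called out above comes down to an ssl context with verification switched off. A minimal sketch, assuming a single shared context (the URL is illustrative, not from this commit):

import ssl
import urllib.request

# Sketch: disable certificate validation, as the Security note describes.
# This is insecure; it trades safety for tolerance of Listal's cert issues.
ctx = ssl.create_default_context()
ctx.check_hostname = False           # skip hostname matching
ctx.verify_mode = ssl.CERT_NONE      # skip certificate-chain validation
html = urllib.request.urlopen("https://www.listal.com/", timeout=10, context=ctx)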


---------
## [1.2.0] - 2018-11-21
### Added
- Stop scraping at a particular image
- Start from or finish at a particular page number
### Changed
- Broken links are now reported.
### Fixed
- Exception handling when an HTTP request times out (see the retry sketch below).
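
The timeout fix amounts to retrying a request a bounded number of times instead of dying on the first socket timeout. A hedged sketch of that pattern (the function name and limits are illustrative, mirroring the loop in Listal.py below):

import urllib.request
import urllib.error

def fetch_with_retries(url, retries=5, timeout=25):
    # Illustrative helper: return the response on HTTP 200, or None on a
    # 404 (broken link) or once the retry budget is exhausted.
    for _ in range(retries):
        try:
            html = urllib.request.urlopen(url, timeout=timeout)
        except urllib.error.HTTPError as err:
            if err.code == 404:
                return None              # broken link; report and move on
        except OSError:
            continue                     # timeout or transient network error
        else:
            if html.getcode() == 200:
                return html
    return None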

-------
## [1.1.0] - 2017-03-29

### Added
- The downloader always checks whether files are already downloaded and skips them (see the sketch below).
- Errors are categorized for better understanding of bugs.
- Checks for broken links.
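
The skip logic is a plain existence check before each download; a minimal sketch (the file-naming rule is hypothetical):

import os
import urllib.request

def download(url, dest_dir="."):
    # Hypothetical helper: name the file after the last URL segment and
    # skip the request entirely if an earlier run already saved it.
    path = os.path.join(dest_dir, url.rsplit('/', 1)[-1])
    if os.path.exists(path):
        return path                      # already downloaded; skip
    urllib.request.urlretrieve(url, path)
    return path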

----
## [1.0.0] - 2016-11-10
First stable release.
### Added
4 changes: 2 additions & 2 deletions L-dl.py
@@ -1,7 +1,7 @@
#
# Listal Downloader
#
#
# vintol.github.io/listal
# v1.2

import urllib.request,ssl
import argparse
82 changes: 49 additions & 33 deletions Listal.py
@@ -1,8 +1,8 @@
# Listal.py
# 08/11/2016 - 2017-04-13
#
# 27/01/2018
# v 1.2.2

import urllib.request, urllib.parse
import urllib.request, urllib.parse, ssl
import bs4
import queue
import threading
@@ -11,6 +11,7 @@
import sys
import argparse
import time
import better_exceptions

# Scrapers

@@ -19,40 +20,54 @@ def get_ipages():
while not qq.empty():
local = threading.local()
local.url = qq.get()
local.keep_going = True
local.skip = False
if STOP_AT is not None and int(local.url.split('//')[2]) > STOP_AT:continue
try:local.html = urllib.request.urlopen(local.url,timeout=2)
except urllib.error.HTTPError as HERR:
if HERR.code == 404:continue
except:
while True:
local.html = urllib.request.urlopen(local.url,timeout=100)
if local.html.getcode() == 200:break
else:continue
try:
local.data = local.html.read()
local.soup = bs4.BeautifulSoup(local.data,'lxml')
for each in local.soup.find_all('div','imagewrap-inner'):
local.img = int(each.a.get('href').strip().split('/')[-1])
if IMG is None:ipages.append(local.img)
elif local.img > IMG:ipages.append(local.img)
elif local.img == IMG:STOP_AT = int(local.url.split('//')[2])
else:pass
except:qq.put(local.url)
while local.keep_going:
try:local.html = urllib.request.urlopen(local.url,timeout=10)
except urllib.error.HTTPError as HERR:
if HERR.code == 404:
local.keep_going = False
local.skip = True
continue
except:continue
if local.html.getcode() == 200:local.keep_going = False
if local.skip:continue
local.data = local.html.read()
local.soup = bs4.BeautifulSoup(local.data,'lxml')
for each in local.soup.find_all('div','imagewrap-inner'):
local.img = int(each.a.get('href').strip().split('/')[-1])
if IMG is None:ipages.append(local.img)
elif local.img > IMG:ipages.append(local.img)
elif local.img == IMG:STOP_AT = int(local.url.split('//')[2])
else:pass

def get_images():
while not qq.empty():
local = threading.local()
local.url = qq.get()
try:local.html = urllib.request.urlopen(local.url,timeout=10)
except urllib.error.HTTPError as HERR:
if HERR.code == 404:continue
except:local.html = urllib.request.urlopen(local.url,timeout=25)
local.keep_going = True
local.skip = True
local.retry = 0
while local.keep_going and local.retry < 5:
try:
local.retry += 1
local.html = urllib.request.urlopen(local.url,timeout=25)
if local.html.getcode() == 200:
local.keep_going = False
local.skip = False
except urllib.error.HTTPError as HERR:
if HERR is not None and HERR.code == 404:
local.keep_going = False
continue
except:continue
if local.skip:continue
for i in range(2):
try:
local.data = local.html.read()
break
images.append(find_image(local.data))
except:continue
images.append(find_image(local.data))
break
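
# A condensed sketch of the keep_going/skip/retry pattern both fetch loops
# above now share (illustrative only, not part of the diff): retry until a
# 200 arrives, stop permanently on a 404, and cap the attempts at five.
import urllib.request, urllib.error

url = "https://www.listal.com/viewimage/12345"   # hypothetical queue entry
keep_going, skip, retry = True, True, 0
while keep_going and retry < 5:
    retry += 1
    try:
        html = urllib.request.urlopen(url, timeout=25)
    except urllib.error.HTTPError as err:
        if err.code == 404:
            keep_going = False           # gone for good; leave skip=True
        continue
    except OSError:
        continue                         # timeout etc.: try again
    if html.getcode() == 200:
        keep_going, skip = False, False  # success: safe to read the body
if not skip:
    data = html.read()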

# Functions

@@ -61,16 +76,16 @@ def mksoup(url):
return bs4.BeautifulSoup(tmp.read(),"lxml")

def find_image(data):
return bs4.BeautifulSoup(data,"lxml").find('img','pure-img').get('src')
return bs4.BeautifulSoup(data,"lxml").find('img','pure-img').get('src').replace("https:","http:")

def post_req():
tmp = urllib.parse.urlencode({ 'listid' : list_id , 'offset' : offset})
return urllib.request.urlopen("http://www.listal.com/item-list/",tmp.encode())
return urllib.request.urlopen("https://www.listal.com/item-list/",tmp.encode())

def mkqueue(url):
global no_pics,no_pages
no_pics = int(mksoup(url).find('a','picturesbutton').span.text.strip())
no_pages = no_pics/20
no_pages = no_pics/50
if no_pages.is_integer():no_pages = int(no_pages)
else:no_pages = int(no_pages) + 1
for i in range(int(args.first_page),no_pages+1):qq.put(url+"/pictures//"+str(i))
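
# Sketch (illustrative, not in the diff): the 20 -> 50 change above tracks
# Listal's new 50-pictures-per-page layout, and the is_integer() branch is
# just a ceiling division:
import math
no_pics = 173                        # e.g. value scraped from 'picturesbutton'
no_pages = math.ceil(no_pics / 50)   # 4, same result as the if/else above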
@@ -91,7 +106,7 @@ def stop_at(IMG):

def update_progress():
progress = 100 - int((100*qq.qsize()) / len(ipages))
pbar = "\r {:0>3}% [{:<50}] ({},{})".format(progress, '#'*int((progress/2)), (len(ipages)-qq.qsize()), len(ipages))
pbar = "\r {:0>3}% [{:<50}] ({},{}) ".format(progress, '#'*int((progress/2)), (len(ipages)-qq.qsize()), len(ipages))
sys.stdout.write(pbar)
sys.stdout.flush()
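
# For reference (illustrative numbers), the format string above renders as:
done, total = 120, 400
progress = 100 - int((100 * (total - done)) / total)   # 30
print("\r {:0>3}% [{:<50}] ({},{}) ".format(progress, '#' * int(progress / 2), done, total))
#  030% [###############                                   ] (120,400)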

@@ -172,8 +187,9 @@ def write():
t.start()
for t in threads:t.join()


print("Time Taken :",time.strftime("%H:%M:%S",time.gmtime(time.time()-started))) #DEBUG
print("Phase I Complete.",len(ipages),"Images Found.")
print("Time Taken :",time.strftime("%H:%M:%S",time.gmtime(time.time()-started)))
print("Phase II :")
enqueue()
threads.clear()
for n in range(args.threads):
