From f44c940509d1cc21a4e35e39f7f7a8f542763579 Mon Sep 17 00:00:00 2001 From: vintol Date: Sun, 31 Mar 2019 13:40:30 +0530 Subject: [PATCH 1/2] Delete Listal.py --- Listal.py | 196 ------------------------------------------------------ 1 file changed, 196 deletions(-) delete mode 100644 Listal.py diff --git a/Listal.py b/Listal.py deleted file mode 100644 index a7d9b96..0000000 --- a/Listal.py +++ /dev/null @@ -1,196 +0,0 @@ -# Listal.py -# 08/11/2016 -# - -import urllib.request, urllib.parse -import bs4 -import queue -import threading -import re -import os -import sys -import argparse -import time - -# Scrapers - -def get_ipages(): - global IMG, STOP_AT - while not qq.empty(): - local = threading.local() - local.url = qq.get() - if STOP_AT is not None and int(local.url.split('//')[2]) > STOP_AT:continue - try:local.html = urllib.request.urlopen(local.url,timeout=2) - except urllib.error.HTTPError as HERR: - if HERR.code == 404:continue - except: - while True: - local.html = urllib.request.urlopen(local.url,timeout=100) - if local.html.getcode() == 200:break - else:continue - try: - local.data = local.html.read() - local.soup = bs4.BeautifulSoup(local.data,'lxml') - for each in local.soup.find_all('div','imagewrap-inner'): - local.img = int(each.a.get('href').strip().split('/')[-1]) - if IMG is None:ipages.append(local.img) - elif local.img > IMG:ipages.append(local.img) - elif local.img == IMG:STOP_AT = int(local.url.split('//')[2]) - else:pass - except:qq.put(local.url) - -def get_images(): - while not qq.empty(): - local = threading.local() - local.url = qq.get() - try:local.html = urllib.request.urlopen(local.url,timeout=10) - except urllib.error.HTTPError as HERR: - if HERR.code == 404:continue - except:local.html = urllib.request.urlopen(local.url,timeout=25) - for i in range(2): - try: - local.data = local.html.read() - break - except:continue - images.append(find_image(local.data)) - -# Functions - -def mksoup(url): - tmp = urllib.request.urlopen(url) - return bs4.BeautifulSoup(tmp.read(),"lxml") - -def find_image(data): - return bs4.BeautifulSoup(data,"lxml").find('img','pure-img').get('src') - -def post_req(): - tmp = urllib.parse.urlencode({ 'listid' : list_id , 'offset' : offset}) - return urllib.request.urlopen("http://www.listal.com/item-list/",tmp.encode()) - -def mkqueue(url): - global no_pics,no_pages - no_pics = int(mksoup(url).find('a','picturesbutton').span.text.strip()) - no_pages = no_pics/20 - if no_pages.is_integer():no_pages = int(no_pages) - else:no_pages = int(no_pages) + 1 - for i in range(int(args.first_page),no_pages+1):qq.put(url+"/pictures//"+str(i)) - -def enqueue(): - global qq,ipages - if not qq.empty():print("WARNING : Queue was not empty.") - qq = queue.Queue() - ipages = sorted(set(ipages)) - for each in ipages: - qq.put("http://www.listal.com/viewimage/"+str(each)+"h") - -def stop_at(IMG): - tmp = [] - for each in ipages: - if each > IMG:tmp.append(each) - ipages = tmp - -def update_progress(): - progress = 100 - int((100*qq.qsize()) / len(ipages)) - pbar = "\r {:0>3}% [{:<50}] ({},{})".format(progress, '#'*int((progress/2)), (len(ipages)-qq.qsize()), len(ipages)) - sys.stdout.write(pbar) - sys.stdout.flush() - -def get_listinfo(url): - global list_type,list_id,list_name,total_pic,offset - soup = mksoup(url) - list_type = soup.find(id='customlistitems').get('data-listformat') - if list_type != "images": - print("This is not a Image list. Currently listal.dl suppots only Image lists.") - quit() - list_id = int(soup.find(id='customlistitems').get('data-listid')) - try:list_name = soup.find('div','headertitle').text.strip() - except AttributeError:list_name = urls.path[6:].replace('-',' ').title() - total_pic = int(soup.find(id='customlistitems').div.get('data-itemtotal')) - offset = int(soup.find('div','loadmoreitems').get('data-offset')) - for each in soup.find_all('div','imagelistbox'): - ipages.append(int(each.a.get('href').strip().split('/')[-1])) - -def get_list(): - global offset - while True: - data = post_req().read() - for each in sorted(set(re.findall("viewimage\\\/([0-9]{4,10})'" ,data.decode()))): - ipages.append(int(each)) - offset = offset + 1 - if offset == total_pic:break - -def write(): - if urls.path.startswith("/list/"):fhand = open(list_name+".txt",'a') - else:fhand = open(name+".txt",'a') - fhand.write("### {} : {} Images\n".format(finished,len(images))) - for each in images:fhand.write(each+"\n") - fhand.close() - -# Global - -qq = queue.Queue() -threads = [] -ipages = [] -images = [] -IMG = None -STOP_AT = None -started = time.time() - -# Main - -parser = argparse.ArgumentParser(description='Scrape Images from \'listal.com\'.') -parser.add_argument('url', type=str, - help='URL to the List or Profile on listal.com.') -parser.add_argument('--from', dest='first_page', type = int, default = None, required = False, - help='The profile page no to start scraping images from') -parser.add_argument('--upto', dest='last_page' , type = int, default = None, required = False, - help='Scrap images only upto the page no.') -parser.add_argument('--threads', dest='threads', type = int, default = 10, required = False, - help='No. of threads to use.') -args = parser.parse_args() - -urls = urllib.parse.urlparse(args.url) -if urls.netloc != 'www.listal.com': - print ("Check the Entered URL.") - quit() - -if urls.path.startswith("/list/"): - if args.first_page is not None:print("Entered URL is of a list. The '--from' option is ignored.") - if args.last_page is not None:print("Entered URL is of a list. The '--upto' option is ignored.") - get_listinfo(urls.geturl()) - get_list() -else: - urls = urllib.parse.urlparse(urls.geturl().split('/picture')[0]) - name = urls.path[1:].replace('-',' ').title() - if args.first_page is None:args.first_page = 1 - if args.last_page is not None: - for i in range(args.first_page,args.last_page+1):qq.put(args.url+"/pictures//"+str(i)) - else:mkqueue(urls.geturl()) - for n in range(args.threads): - t = threading.Thread(target=get_ipages) - threads.append(t) - t.start() - for t in threads:t.join() - - -print("Time Taken :",time.strftime("%H:%M:%S",time.gmtime(time.time()-started))) #DEBUG -enqueue() -threads.clear() -for n in range(args.threads): - t = threading.Thread(target=get_images) - threads.append(t) - t.start() - -while not qq.empty(): - update_progress() - sys.stdout.flush() - time.sleep(1) - -for t in threads:t.join() - -time_taken = time.time() - started -finished = time.strftime("%d/%m/%Y %H:%M",time.localtime()) -write() -print("Time Taken :",time.strftime("%H:%M:%S",time.gmtime(time_taken))) - -# END From 6aecdd05a437ac3a6b174e1cd004857a4b41d23f Mon Sep 17 00:00:00 2001 From: vintol Date: Sun, 31 Mar 2019 14:13:59 +0530 Subject: [PATCH 2/2] Delete listal-dl.py --- listal-dl.py | 220 --------------------------------------------------- 1 file changed, 220 deletions(-) delete mode 100644 listal-dl.py diff --git a/listal-dl.py b/listal-dl.py deleted file mode 100644 index c5cc5d2..0000000 --- a/listal-dl.py +++ /dev/null @@ -1,220 +0,0 @@ -# -# Listal-dl -# -# v0.21 28/08/2016 -# -# Available under GNU GPL v3 -# -# listal-dl Copyright (C) 2016 Tejas Kumar -# -# This program comes with ABSOLUTELY NO WARRANTY. -# This is free software, and you are welcome to redistribute it -# under certain conditions; see file "LICENSE". -# -import urllib.request -from bs4 import * -import queue -import threading -import os -import time - -## - -class Imager (threading.Thread): - def __init__(self, threadID, queue, lock, function, store): - threading.Thread.__init__(self) - self.threadID = threadID - self.name = threadID - self.queue = queue - self.lock = lock - self.execution_function = function - self.output_store = store - - def run(self): - while not self.queue.empty(): - self.lock.acquire() - self.item = self.queue.get() - print(self.name,"got item",self.item) - self.lock.release() - self.execution_function(self) - if self.output_store is not None: - self.lock.acquire() - self.output_store.append(self.output) - self.lock.release() - self.queue.task_done() - -# - -def ipages(self): - try: - self.html = urllib.request.urlopen(self.item,timeout=2) - except: - while True: - try: - self.html = urllib.request.urlopen(self.item,timeout=5) - if self.html.getcode() == 200:break - except:continue - try:self.html_data = self.html.read() - except: - self.lock.acquire() - self.queue.put(self.item) - self.lock.release() - self.output = "\n" - return - self.soup = BeautifulSoup(self.html_data,"lxml") - self.output = [] - for link in self.soup.find_all('a'): - if link.get('href').startswith("http://www.listal.com/viewimage"): - self.output.append(link.get('href')+"h") -# - -def limages(self): - try: - self.html = urllib.request.urlopen(self.item,timeout=2) - except: - while True: - try: - self.html = urllib.request.urlopen(self.item,timeout=5) - if self.html.getcode() == 200:break - except:continue - try:self.html_data = self.html.read() - except: - self.lock.acquire() - self.queue.put(self.item) - self.lock.release() - self.output = "\n" - return - self.soup = BeautifulSoup(self.html_data,"lxml") - self.output = self.soup.find(title=name).get('src') - -# - -def idownload(self): - self.iname = self.item.split()[0] - self.link = self.item.split()[1] - try: - self.html = urllib.request.urlopen(self.link,timeout=10) - except: - while True: - try: - self.html = urllib.request.urlopen(self.link,timeout=100) - if self.html.getcode() == 200:break - except:continue - try:self.html_data = self.html.read() - except: - self.lock.acquire() - self.queue.put(self.item) - self.lock.release() - return - while True: - try: - open(self.iname,'wb').write(self.html_data) - except:continue - break - -# - -def pages(): - - url_name = name.strip().lower().replace(' ','-') - page_start = int(input("Start at Page No. : ")) - page_end = int(input("End at Page No. : ")) + 1 - no_threads = int(input("No. of Threads:")) - - for i in range(page_start,page_end): - qq.put("http://www.listal.com/"+url_name+"/pictures//"+str(i)) - - for n in range(no_threads): - t = Imager("thread-{}".format(n),qq,thlock,ipages,output) - threads.append(t) - t.start() - - qq.join() - - for t in threads: - t.join() - - for bulk in output: - for link in bulk: - links.append(link) - -# Now Image Pages to Image Links - - for each in links:qq.put(each) - output.clear() - - for n in range(no_threads): - t = Imager("thread-{}".format(n),qq,thlock,limages,output) - threads.append(t) - t.start() - - qq.join() - - for t in threads: - t.join() - - fhand = open("Images",'a') - for link in output: - fhand.write(link+"\n") - -# - -def images_download(): - - links = open("Images",'r').read().split() - if len(links) <= 8000: - for i in range(len(links)): - qq.put("{} {}".format("D"+str(1001+i)+".jpg",links[i])) - elif len(links) > 8000: - for i in range(8000): - qq.put("{} {}".format("D"+str(1001+i)+".jpg",links[i])) - for i in range(len(links)-8000): - qq.put("{} {}".format("E"+str(1001+i)+".jpg",links[8000+i])) - - for n in range(int(input("No. of Threads:"))): - t = Imager("thread-{}".format(n),qq,thlock,idownload,None) - threads.append(t) - t.start() - - qq.join() - - for t in threads: - t.join() - -## - -print (""" listal-dl Copyright (C) 2016 Tejas Kumar - - This program comes with ABSOLUTELY NO WARRANTY. - This is free software, and you are welcome to redistribute it - under certain conditions; see file "LICENSE". \n """) - -name = input("Name :") -qq = queue.Queue() -thlock = threading.Lock() -threads=[] -output=[] -links=[] - -dir_name=name.split()[0]+name.split()[1][0] -dirs = os.listdir(os.getcwd()) -if dir_name in dirs: - print(dir_name,"already exists !") - os.chdir(dir_name) - print("Moving to directory :",os.getcwd()) -else: - os.mkdir(dir_name) - os.chdir(dir_name) - print("Moving to directory :",os.getcwd()) - - -choise = input (" 0] Get Image Links \n 1] Download Images \n ===> ") -time_started = time.time() -if choise == "0":pages() -elif choise == "1":images_download() -else: print("Try Again.") - -time_taken = time.time() - time_started -print("Time Taken = {}:{}:{}".format(str(int(time_taken/3600)).zfill(2),str(int((time_taken%3600)/60)).zfill(2),str(int((time_taken%3600)%60)).zfill(2))) - -## \ No newline at end of file