Commit dd4ca00

^q^

KurtBestor committed Jun 6, 2021
1 parent 7570075 commit dd4ca00
Showing 17 changed files with 2,025 additions and 181 deletions.
21 changes: 7 additions & 14 deletions src/extractor/bili_downloader.py
@@ -1,8 +1,3 @@
-# uncompyle6 version 3.5.0
-# Python bytecode 2.7 (62211)
-# Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
-# Embedded file name: bili_downloader2.pyo
-# Compiled at: 2019-10-07 03:49:44
 import downloader
 from utils import Soup, LazyUrl, Downloader, query_url, get_outdir, get_print, cut_pair, format_filename, clean_title, get_resolution, try_n
 import hashlib, json
@@ -40,12 +35,8 @@ class Video(object):
 
     def __init__(self, url, referer, id, p):
         ext = os.path.splitext(url.split('?')[0])[1]
-        self.filename = (u'{}_{}{}').format(id, p, ext)
-
-        def f(_):
-            return url
-
-        self.url = LazyUrl(referer, f, self)
+        self.filename = (u'{}.part{}{}').format(id, p, ext)
+        self.url = LazyUrl(referer, lambda _: url, self, detect_local=False)
 
 
 # 1804
@@ -76,7 +67,9 @@ class Downloader_bili(Downloader):
     URLS = ['bilibili.com', 'bilibili.tv']
     lock = True
     detect_removed = False
+    detect_local_lazy = False
     display_name = 'bilibili'
+    single = True
 
     def init(self):
         self.url = fix_url(self.url, self.cw)
@@ -108,7 +101,7 @@ def read(self):
         title = format_filename(title, self.id_, '.mp4')[:-4]
         n = int(math.ceil(8.0 / len(videos)))
         self.print_(('n_threads: {}').format(n))
-        self.enableSegment(n_threads=n)
+        self.enableSegment(n_threads=n, overwrite=True)
         self.title = title
 
     def post_processing(self):
@@ -117,8 +110,8 @@ def post_processing(self):
         outdir = get_outdir(self.type)
         out = os.path.join(outdir, self.title + '.mp4')
         ffmpeg.join(cw.names, out, cw)
-        utils.remove(self.dir)
-        self.single = True
+        for file in cw.names:
+            utils.remove(file)
         cw.setNameAt(0, out)
         del cw.imgs[1:]
         cw.dones.add(os.path.realpath(out))
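One note on the read() hunk above: n = int(math.ceil(8.0 / len(videos))) hands roughly eight segment threads out across the video's parts. A quick standalone check of that arithmetic (the part counts below are made up for illustration):

import math

# ceil(8 / n_parts) segment threads per part, as in Downloader_bili.read().
for n_parts in (1, 2, 3, 8):
    n = int(math.ceil(8.0 / n_parts))
    print('{} part(s): {} thread(s) each'.format(n_parts, n))
# 1 part(s): 8 thread(s) each
# 2 part(s): 4 thread(s) each
# 3 part(s): 3 thread(s) each
# 8 part(s): 1 thread(s) each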
80 changes: 53 additions & 27 deletions src/extractor/iwara_downloader.py
@@ -1,41 +1,47 @@
 from __future__ import division, print_function, unicode_literals
 import downloader
-from utils import Soup, urljoin, Downloader, LazyUrl, get_print, clean_url, clean_title
+from utils import Soup, urljoin, Downloader, LazyUrl, get_print, clean_url, clean_title, check_alive, Session, try_n
 import ree as re
 import json
 import os
 from timee import sleep
 from io import BytesIO
+TIMEOUT = 300
 
 
 
 class File(object):
     thumb = None
 
-    def __init__(self, type, url, title, referer, p=0):
+    def __init__(self, type, url, title, referer, p=0, multi_post=False):
         self.type = type
         self.url = LazyUrl(referer, lambda _: url, self)
         ext = os.path.splitext(url.split('?')[0])[1]
         if ext.lower() == '.php':
             ext = '.mp4'
         if type == 'video':
             self.filename = clean_title('{}{}'.format(title, ext))
+        elif type == 'image':
+            name = '{}_p{}'.format(clean_title(title), p) if multi_post else p
+            self.filename = '{}{}'.format(name, ext)
         else:
-            self.filename = '{}{}'.format(p, ext)
+            raise NotImplementedError(type)
         self.title = title
 
 
-class LazyVideo(object):
-    type = 'video'
+class LazyFile(object):
     _url = None
     thumb = None
 
-    def __init__(self, url):
+    def __init__(self, url, type_, session):
         self.url = LazyUrl(url, self.get, self)
+        self.type = {'videos': 'video', 'images': 'image'}.get(type_) or type_
+        self.session = session
 
     def get(self, url):
         if self._url:
             return self._url
-        file = get_files(url)[0]
+        file = get_files(url, self.session, multi_post=True)[0]
         self.title = file.title
         self.thumb = file.thumb
         self.filename = file.filename
Expand All @@ -56,42 +62,61 @@ def fix_url(cls, url):
url = clean_url(url)
return url.split('?')[0]

def init(self):
self.session = Session()
self.session.cookies.update({'show_adult': '1', 'has_js': '1'})
self.setTimeout(TIMEOUT)

def read(self):
file = None
files = None
title = None
if '/users/' in self.url or '/user/' in self.url:
type_ = 'videos'
try:
if self.url.split('/users/')[1].split('/')[1] == 'images':
type_ = 'images'
except:
pass
info = read_channel(self.url, type_, self.cw)
info = read_channel(self.url, type_, self.session, self.cw)
title = info['title']
urls = info['urls']
if type_ == 'videos':
files = [LazyVideo(url) for url in urls]
file = self.process_playlist('[Channel] [{}] {}'.format(type_.capitalize(), info['title']), files)
files = [LazyFile(url, type_, self.session) for url in urls]
file = self.process_playlist('[Channel] [{}] {}'.format(type_.capitalize(), title), files)
elif type_ == 'images':
files = [LazyFile(url, type_, self.session) for url in urls]
title = '[Channel] [{}] {}'.format(type_.capitalize(), title)
else:
raise NotImplementedError('channel images')
else:
files = get_files(self.url, self.cw)
raise NotImplementedError(type_)

if file is None:
if files is None:
files = get_files(self.url, self.session, cw=self.cw)
for file in files:
self.urls.append(file.url)
file = files[0]

if file.type == 'youtube':
return self.Invalid('[iwara] Youtube: {}'.format(self.url))

if file.type == 'img':
if file.type == 'image':
self.single = False
self.title = clean_title(file.title)
self.title = clean_title(title or file.title)

if file.thumb is not None:
self.setIcon(file.thumb)



def read_channel(url, type_, cw=None):
@try_n(4)
def read_html(*args, **kwargs):
kwargs['timeout'] = TIMEOUT
return downloader.read_html(*args, **kwargs)


def read_channel(url, type_, session, cw=None):
print_ = get_print(cw)
html = downloader.read_html(url)
html = read_html(url, session=session)
soup = Soup(html)
if soup.find('div', id='block-mainblocks-user-connect'):
username = re.find(r'''/messages/new\?user=(.+)['"]''', html, err='no username')
@@ -102,13 +127,14 @@ def read_channel(url, type_, cw=None):
     urls = []
     urls_set = set()
     for p in range(50):
+        check_alive(cw)
         url = 'https://ecchi.iwara.tv/users/{}/{}?page={}'.format(username, type_, p)
         print_(url)
-        html = downloader.read_html(url)
+        html = read_html(url, session=session)
        soup = Soup(html)
         if p == 0:
             title = soup.find('h1', class_='page-title').text
-            info['title'] = title.replace("'s videos", '').strip()
+            info['title'] = title.replace("'s videos", '').replace("'s images", '').strip()
 
         view = soup.find('div', class_='view-content')
         if view is None:
@@ -129,9 +155,9 @@ def read_channel(url, type_, cw=None):
     return info
 
 
-def get_files(url, cw=None):
+def get_files(url, session, multi_post=False, cw=None):
     print_ = get_print(cw)
-    html = downloader.read_html(url)
+    html = read_html(url, session=session)
     soup = Soup(html)
     h = soup.find('h1', class_='title')
     content = h.parent.parent.parent
@@ -143,10 +169,10 @@ def get_files(url, cw=None):
     elif video:
         type = 'video'
     else:
-        type = 'img'
+        type = 'image'
     print_(('type: {}').format(type))
     files = []
-    if type == 'img':
+    if type == 'image':
         urls = set()
         for img in content.findAll('img'):
             img = urljoin(url, img.parent.attrs['href'])
@@ -156,7 +182,7 @@ def get_files(url, cw=None):
                 print('duplicate')
                 continue
             urls.add(img)
-            file = File(type, img, title, url, len(files))
+            file = File(type, img, title, url, len(files), multi_post=multi_post)
             files.append(file)
 
     elif type == 'youtube':
@@ -168,7 +194,7 @@ def get_files(url, cw=None):
         print('url_thumb:', url_thumb)
         id = re.find('videos/([0-9a-zA-Z_-]+)', url, err='no video id')
         url_data = urljoin(url, '/api/video/{}'.format(id))
-        s_json = downloader.read_html(url_data, url)
+        s_json = read_html(url_data, url, session=session)
         data = json.loads(s_json)
         video = data[0]
         url_video = urljoin(url, video['uri'])
@@ -178,7 +204,7 @@ def get_files(url, cw=None):
         downloader.download(url_thumb, buffer=file.thumb, referer=url)
         files.append(file)
     else:
-        raise Exception(('type "{}" is not supported').format(type))
+        raise NotImplementedError(type)
     return files
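The new read_html wrapper above retries failed iwara reads via utils.try_n with a 300-second timeout. A minimal sketch of what such a retry decorator typically looks like — the real utils.try_n may differ in signature, sleep behavior, and logging:

import functools
import time

def try_n(n, sleep=1):
    # Hypothetical stand-in for utils.try_n: call the wrapped function
    # up to n times, re-raising the last error if every attempt fails.
    def decorator(f):
        @functools.wraps(f)
        def wrapped(*args, **kwargs):
            for i in range(n):
                try:
                    return f(*args, **kwargs)
                except Exception:
                    if i == n - 1:
                        raise
                    time.sleep(sleep)
        return wrapped
    return decorator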


16 changes: 10 additions & 6 deletions src/extractor/kakaopage_downloader.py
@@ -5,7 +5,7 @@
 from translator import tr_
 import page_selector
 import json
-UA = downloader.hdr['User-Agent']
+import clf2
 
 
 class Page(object):
@@ -28,13 +28,12 @@ def __init__(self, url, page, p):
 class Downloader_kakaopage(Downloader):
     type = 'kakaopage'
     URLS = ['page.kakao.com/home']
-    MAX_CORE = 8
+    MAX_CORE = 4
     MAX_SPEED = 4.0
     display_name = 'KakaoPage'
 
     def init(self):
         self.session = Session()
-        self.session.headers['User-Agent'] = UA
 
     @classmethod
     def fix_url(cls, url):
@@ -99,9 +98,14 @@ def get_pages(url, session):
     return pages
 
 
+def read_html(url, session):
+    res = clf2.solve(url, session=session)
+    return res['html']
+
+
 @try_n(2)
 def get_imgs_page(page, session):
-    html = downloader.read_html(page.url, session=session)
+    html = read_html(page.url, session=session)
     did = re.find('"did" *: *"(.+?)"', html, err='no did')
     url_api = 'https://api2-page.kakao.com/api/v1/inven/get_download_data/web'
     data = {
@@ -133,7 +137,7 @@ def get_info(url, session, cw=None):
 
     info = {}
 
-    html = downloader.read_html(url, session=session)
+    html = read_html(url, session=session)
     soup = Soup(html)
 
     __NEXT_DATA__ = soup.find('script', id='__NEXT_DATA__')
@@ -142,7 +146,7 @@
     tid = data['props']['initialState']['common']['constant']['tid']
     print_('tid: {}'.format(tid))
     session.cookies['_kptid'] = tid
-    html = downloader.read_html(url, session=session)
+    html = read_html(url, session=session)
     soup = Soup(html)
 
     title = soup.find('h2').text.strip()
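Two related changes in this file: page loads now go through clf2.solve instead of downloader.read_html, presumably to clear the bot checks that plain HTTP reads were tripping, and get_imgs_page then scrapes the did token out of the solved HTML with ree.find. For reference, the equivalent of that lookup with the stdlib re module, run on a fabricated fragment:

import re

html = '... "did" : "abcdef0123456789" ...'  # fabricated sample fragment
m = re.search(r'"did" *: *"(.+?)"', html)    # same pattern as the hunk above
did = m.group(1) if m else None
print(did)  # abcdef0123456789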
2 changes: 1 addition & 1 deletion src/extractor/lhscan_downloader.py
@@ -66,7 +66,7 @@ def soup(self):
 
     @property
     def name(self):
-        title = self.soup.findAll('span', {'itemprop': 'name'})[-1].text.strip()
+        title = self.soup.find('ul', class_='manga-info').find('h3').text
         return clean_title(title)
 
     def read(self):
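The name property now pulls the title from the manga-info list rather than the last itemprop="name" span. Assuming Soup wraps BeautifulSoup as elsewhere in the project, the new lookup behaves like this on a minimal, made-up fragment:

from bs4 import BeautifulSoup

html = '<ul class="manga-info"><li><h3>Sample Manga Title</h3></li></ul>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('ul', class_='manga-info').find('h3').text)  # Sample Manga Title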
2 changes: 1 addition & 1 deletion src/extractor/nhentai_com_downloader.py
@@ -10,7 +10,7 @@
 @Downloader.register
 class Downloader_nhentai_com(Downloader):
     type = 'nhentai_com'
-    URLS = ['nhentai.com']
+    URLS = [r'regex:https?://nhentai.com']
     MAX_CORE = 16
     display_name = 'nhentai.com'
 
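Assuming plain URLS entries are substring-matched while regex:-prefixed entries are applied as regular expressions, anchoring the pattern to the URL scheme keeps lookalike addresses from binding to this extractor; roughly:

import re

good = 'https://nhentai.com/en/g/123'
bad = 'https://example.com/nhentai.com/page'

print('nhentai.com' in bad)                            # True: old substring rule misfires
print(bool(re.search(r'https?://nhentai.com', bad)))   # False under the new pattern
print(bool(re.search(r'https?://nhentai.com', good)))  # True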