diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..491deae --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +version: 2 +updates: +- package-ecosystem: pip + directory: "/" + schedule: + interval: daily + open-pull-requests-limit: 10 diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 0000000..d858df5 --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,46 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main, develop ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest pytest-asyncio pytest-cov + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest and generate coverage report + run: | + pytest --cov=./ --cov-report=xml + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + file: ./coverage.xml + files: ./coverage1.xml,./coverage2.xml + directory: ./coverage/reports/ + flags: unittests + name: codecov-umbrella + fail_ci_if_error: true + verbose: true diff --git a/README.md b/README.md index 1ddf215..53fab5e 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,14 @@ # Rabbit Ark +[![Code Style](https://img.shields.io/badge/code%20style-black-black)](https://github.com/psf/black) + +> **Warning!** Rabbit Ark is only guaranteed to work on Python 3.9 or higher. > Scalable downloader that downloads asynchronously - [Rabbit Ark](#rabbit-ark) - [Description](#description) + - [Download](#download) + - [How to use](#how-to-use) - [Supported Sites](#supported-sites) - [Script](#script) - [Special Thanks](#special-thanks) @@ -12,14 +17,36 @@ This program is inspired by [YouTube-dl](https://github.com/ytdl-org/youtube-dl/) and [Hitomi Downloader](https://github.com/KurtBestor/Hitomi-Downloader) -The download is processed using the [aiomultiprocess](https://github.com/omnilib/aiomultiprocess) module. +### Download + +You can download it [here](https://github.com/Saebasol/rabbit-ark/releases). + +### How to use + +First, specify which extractor to use: + +```sh +rabbitark pixiv +``` + +Then provide the downloadable information, such as a link, via the --downloadable argument. + +```sh + +rabbitark pixiv --downloadable pixiv-artwork-url +``` + +After a while, you will see that the folder has been created and the images have been downloaded. + +For more information, run the command with the -h argument. 
## Supported Sites -| Site | URL | Extractor release life cycle | -| :-----------: | ------------------- | ---------------------------- | -| **Hitomi.la** | | Alpha | -| **Pixiv** | | Alpha | +| Site | URL | Extractor release life cycle | +| :---------: | --------------------- | ---------------------------- | +| **Hitomi** | | Alpha | +| **Pixiv** | | Alpha | +| **Youtube** | | Alpha | ## Script diff --git a/rabbitark/__init__.py b/rabbitark/__init__.py new file mode 100644 index 0000000..d300d33 --- /dev/null +++ b/rabbitark/__init__.py @@ -0,0 +1,10 @@ +import sys +from collections import namedtuple + +__version__ = "0.1.0" + +__author__ = "Ryu JuHeon" + +VersionInfo = namedtuple("VersionInfo", "major minor micro releaselevel serial") + +version_info = VersionInfo(major=0, minor=1, micro=0, releaselevel="alpha", serial=0) diff --git a/rabbitark/__main__.py b/rabbitark/__main__.py index 25bda43..f6cad35 100644 --- a/rabbitark/__main__.py +++ b/rabbitark/__main__.py @@ -1,14 +1,32 @@ import argparse -import asyncio +import logging +import platform +import sys +from asyncio.events import get_event_loop -from rabbitark.config import config +from rabbitark.config import Config +from rabbitark.extractor import load from rabbitark.rabbitark import RabbitArk -from rabbitark.utils.utils import load_cookie_txt +from rabbitark.utils import load_cookie_txt -parser = argparse.ArgumentParser("rabbitark") +logger = logging.getLogger("rabbitark") +logger.setLevel(logging.DEBUG) + +# formatter +formatter = logging.Formatter("%(asctime)s - (%(name)s) - [%(levelname)s]: %(message)s") + +# console handler +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(formatter) +logger.addHandler(ch) + + +config = Config() +parser = argparse.ArgumentParser("rabbitark") -parser.add_argument("extractor", type=str, help="Specifies the extractor to use") +parser.add_argument("extractor", type=str, help="Specifies the extractor name") parser.add_argument( "--downloadable", @@ 
-18,10 +36,20 @@ parser.add_argument("--base", type=str, help="Specifies the pre-created folder") -parser.add_argument("--folder", type=str, help="") +parser.add_argument("--folder", type=str, help="Specifies the folder name") parser.add_argument("--cookies", type=str, help="load cookies.txt") +parser.add_argument("--page", type=int, help="Youtube page limit") + +parser.add_argument("--custom_extractor", type=str, help="use custom extractor") + +parser.add_argument( + "--verbose", action="store_true", help="print debugging information" +) + +parser.add_argument("--report", action="store_true", help="save debugging informaion") + args = parser.parse_args() if args.base: @@ -33,6 +61,31 @@ if args.cookies: config.COOKIES = load_cookie_txt(args.cookies) -if __name__ == "__main__": - ark = RabbitArk(args.extractor) - asyncio.run(ark.start(args.downloadable)) +if args.page: + config.YOUTUBE_PAGE_LIMIT = args.page + +if args.custom_extractor: + config.CUSTOM_EXTRACTOR = args.custom_extractor + +if args.verbose: + ch.setLevel(logging.DEBUG) + +if args.report: + fh = logging.FileHandler("rabbitark.log") + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + +logger.debug("system ver: %s %s", platform.python_implementation(), sys.version) +logger.debug("platform: %s", platform.platform()) +logger.debug("args: %s", sys.argv[1:]) + +logger.info("start import extractor") +load() +logger.info("sucessfully import extractor") + +logger.debug("start loop") +get_event_loop().run_until_complete( + RabbitArk(args.extractor, config).start(args.downloadable) +) +logger.debug("complete loop") diff --git a/rabbitark/abc.py b/rabbitark/abc.py new file mode 100644 index 0000000..df29f52 --- /dev/null +++ b/rabbitark/abc.py @@ -0,0 +1,21 @@ +from abc import ABC, ABCMeta, abstractmethod +from typing import Any, Optional + +from aiohttp.client import ClientSession + +from rabbitark.config import Config +from rabbitark.dataclass import DownloadInfo + + +class 
class Image:
    """One remote file together with the local file name it is stored under."""

    def __init__(self, url: str, filename: Optional[str] = None) -> None:
        self.url: str = url
        # Derive the name from the URL only when the caller gave none.
        self.filename: str = filename if filename else split(url)


class DownloadInfo:
    """Everything the downloader needs for one job.

    Holds the images to fetch, an optional title, and any extra keyword
    arguments, which are stored untouched in ``kwargs``.
    """

    def __init__(
        self, image: list[Image], title: Optional[str] = None, **kwargs: Any
    ) -> None:
        self.image = image
        self.title = title
        self.kwargs = kwargs

    def to_download(self, path: str):
        """Return a ``{url: path + filename}`` mapping for every held image.

        ``path`` is prepended verbatim, so it must already end with a
        separator if one is wanted.
        """
        targets = {}
        for item in self.image:
            targets[item.url] = path + item.filename
        return targets


class Response:
    """Plain container for a status code, an optional message and a body."""

    def __init__(self, status: int, message: Optional[Any], body: Any) -> None:
        self.status, self.message, self.body = status, message, body
class Downloader(SessionPoolRequest):
    """Write the files described by a ``DownloadInfo`` to disk.

    Target directories are derived from ``config.BASE_DIRECTORY`` and
    ``config.FOLDER``; the transfers are fanned out through
    ``request_using_session_pool``.
    """

    def __init__(self, config: Config) -> None:
        self.config = config
        super().__init__()

    async def download(
        self,
        session: ClientSession,
        url: str,
        method: Literal["GET"],
        _: Any,
        **kwargs: Any,
    ):
        """Stream ``url`` into the file that ``kwargs["filename"]`` maps it to.

        Args:
            session: Session used to issue the request.
            url: Address to fetch; also the key into the filename mapping.
            method: HTTP method forwarded to ``session.request``.
            _: Unused positional slot (the pool passes ``return_method`` here).
            **kwargs: Must contain ``filename`` (a ``{url: path}`` mapping);
                everything else is forwarded to ``session.request``.

        Raises:
            KeyError: If ``filename`` is missing or has no entry for ``url``.
        """
        filename = kwargs.pop("filename")
        # ``async with`` releases the connection even when writing fails;
        # a bare ``await session.request(...)`` left the response open.
        async with session.request(method, url, **kwargs) as response:
            async with open(filename[url], "wb") as f:
                async for data, _end_of_chunk in response.content.iter_chunks():
                    await f.write(data)

    @staticmethod
    async def _ensure_directory(path: str) -> None:
        """Create ``path`` unless something already exists there."""
        if exists(path):
            return
        try:
            await mkdir(path)
        except FileExistsError:
            # Created by another task between the check and the call.
            pass

    async def create_folder(self, title: Optional[str] = None) -> str:
        """Create the download directory and return it with a trailing ``/``.

        Args:
            title: Optional sub-directory created below the configured folder.

        Returns:
            ``BASE_DIRECTORY[/FOLDER][/title]/``. The trailing slash matters
            because callers concatenate file names directly onto the result.
        """
        parts = [self.config.BASE_DIRECTORY]
        # An unset folder used to be interpolated as the literal text "None".
        if self.config.FOLDER:
            parts.append(self.config.FOLDER)
        default_dir = "/".join(parts)
        await self._ensure_directory(default_dir)

        if not title:
            return f"{default_dir}/"

        title_dir = f"{default_dir}/{title}"
        await self._ensure_directory(title_dir)
        return f"{title_dir}/"

    async def start_download(self, download_info: DownloadInfo):
        """Download every image of ``download_info`` into its target folder."""
        directory = await self.create_folder(download_info.title)
        filename_mapping = download_info.to_download(directory)
        url_list = list(filename_mapping)
        if not url_list:
            # Nothing to fetch; avoid handing an empty batch to the pool.
            return
        await self.request_using_session_pool(
            self.download,
            url_list,
            "GET",
            request_per_session=self.config.REQUEST_PER_SESSION,
            filename=filename_mapping,
            **download_info.kwargs,
        )
aiomultiprocess import Pool - -from rabbitark.config import config -from rabbitark.utils import Requester -from rabbitark.utils.default_class import DownloadInfo, Info - - -class Downloader(Requester): - def __init__(self): - super().__init__() - self.base_directory = config.BASE_DIRECTORY - self.folder = config.FOLDER - - async def create_folder(self, title=None) -> None: - if not os.path.exists(f"{self.base_directory}/{self.folder}"): - await aioos.mkdir(f"{self.base_directory}/{self.folder}") - - if title: - if not os.path.exists(f"{self.base_directory}/{self.folder}/{title}"): - await aioos.mkdir(f"{self.base_directory}/{self.folder}/{title}") - - def check_folder(self, title: str, filename: str) -> str: - if title: - directory = f"{self.base_directory}/{self.folder}/{title}/{filename}" - else: - directory = f"{self.base_directory}/{self.folder}/{filename}" - - return directory - - def download_info_generator(self, info: Info) -> DownloadInfo: - for image in info.image: - yield DownloadInfo( - image.url, - self.check_folder(info.title, image.filename), - info.headers if info.headers else {}, - ) - - def checking_image_object(self, info: Info) -> DownloadInfo: - if isinstance(info.image, list): - return self.download_info_generator(info) - else: - return [ - DownloadInfo( - info.image.url, - self.check_folder(info.title, info.image.filename), - info.headers if info.headers else {}, - ) - ] - - async def download(self, download_info: DownloadInfo) -> None: - image_byte = await self.get(download_info.url, headers=download_info.headers) - async with aiofiles.open(download_info.directory, mode="wb") as f: - await f.write(image_byte.body) - - async def start_download(self, info: Info) -> None: - download_info = self.checking_image_object(info) - await self.create_folder(info.title) - async with Pool() as pool: - async for _ in pool.map(self.download, download_info): - pass - - async def start_multiple_download(self, info_list: list[Info]) -> None: - async with Pool() 
logger = logging.getLogger("rabbitark.extractor.__init__")

# Directory scanned for extractor modules. A frozen executable exposes its
# unpack location through ``sys._MEIPASS``; otherwise use this package's folder.
if getattr(sys, "frozen", False):
    logger.info("detect exe extractor folder as a temporary folder")
    directory = getattr(sys, "_MEIPASS", os.path.abspath(os.path.dirname(__file__)))
else:
    directory = os.path.dirname(os.path.realpath(__file__))


def load():
    """Import every extractor module found in ``directory``.

    A file is treated as an extractor when its extension is exactly ``.py``
    and its name contains no double underscore (which skips ``__init__.py``).
    Modules are imported by bare name when frozen, otherwise as
    ``rabbitark.extractor.<name>``.

    Returns:
        The names of the modules whose import raised. Each traceback is
        printed and the remaining modules are still attempted.
    """
    frozen = getattr(sys, "frozen", False)
    failed = []

    for file in os.listdir(directory):
        # splitext strips only the real suffix; the former
        # ``re.sub(".py", "", file)`` also removed any "<char>py" run inside
        # the name (e.g. "happy.py" became "ha").
        extension, suffix = os.path.splitext(file)
        if suffix != ".py" or "__" in file:
            continue

        try:
            logger.debug("import %s", extension)
            import_module(
                extension if frozen else f"rabbitark.extractor.{extension}"
            )
        except Exception:
            # Best effort: one broken extractor must not block the others,
            # but KeyboardInterrupt/SystemExit are allowed to propagate.
            traceback.print_exc()
            failed.append(extension)

    return failed
import Image, Info +from rabbitark.abc import BaseExtractor +from rabbitark.config import Config +from rabbitark.dataclass import DownloadInfo, Image +from rabbitark.rabbitark import RabbitArk +from rabbitark.request import Request class HitomiImageModel: @@ -17,15 +21,15 @@ def __init__(self, width: int, hash_: str, haswebp: int, name: str, height: int) class HitomiGalleryInfoModel: def __init__( self, - language_localname: str, - language: str, - date: str, - files: list, - tags: list, - japanese_title: str, - title: str, - galleryid: int, - type_: str, + language_localname: Optional[str], + language: Optional[str], + date: Optional[str], + files: Optional[List], + tags: Optional[List], + japanese_title: Optional[str], + title: Optional[str], + galleryid: Optional[int], + type_: Optional[str], ): self.language_localname = language_localname self.language = language @@ -38,21 +42,20 @@ def __init__( self.type_ = type_ -class HitomiRequester(Requester): - def __init__(self): - super().__init__( - headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", - "referer": "https://hitomi.la", - } - ) +class HitomiRequester(Request): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", + "referer": "https://hitomi.la", + } - async def get_galleryinfo(self, index): + async def get_galleryinfo(self, index: int): response = await self.get(f"https://ltn.hitomi.la/galleries/{index}.js", "text") - js_to_json = response.body.replace("var galleryinfo = ", "") + js_to_json = response.replace("var galleryinfo = ", "") return parse_galleryinfo(json.loads(js_to_json)) - async def images(self, index: int) -> tuple[list[Image], HitomiGalleryInfoModel]: + async def images( + self, index: int + ) -> Optional[Tuple[List[Image], HitomiGalleryInfoModel]]: galleryinfomodel = await self.get_galleryinfo(index) if 
not galleryinfomodel: return None @@ -63,17 +66,19 @@ async def images(self, index: int) -> tuple[list[Image], HitomiGalleryInfoModel] return images, galleryinfomodel -class Hitomi(HitomiRequester): - def __init__(self) -> None: - super().__init__() +@RabbitArk.register("hitomi") +class Hitomi(HitomiRequester, BaseExtractor): + async def get_download_info( + self, download_source: int, config: Config + ) -> DownloadInfo: + images: Optional[ + Tuple[List[Image], HitomiGalleryInfoModel] + ] = await self.images(download_source) - async def download_info(self, index) -> Info: - images, model = await self.images(index) - return Info(images, model.galleryid, self.headers) + if not images: + return None - async def multiple_download_info(self, index_list: list): - for index in index_list: - yield self.download_info(index) + return DownloadInfo(images[0], images[1].galleryid, headers=self.headers) def subdomain_from_galleryid(g: int, number_of_frontends: int) -> str: @@ -82,7 +87,7 @@ def subdomain_from_galleryid(g: int, number_of_frontends: int) -> str: return r -def subdomain_from_url(url: str) -> str: +def subdomain_from_url(url: str) -> Optional[str]: retval = "b" number_of_frontends = 3 @@ -91,6 +96,9 @@ def subdomain_from_url(url: str) -> str: r = re.compile(r"\/[0-9a-f]\/([0-9a-f]{2})\/") m = r.search(url) + if not m: + return None + g = int(m[1], b) if g < 0x30: @@ -156,7 +164,7 @@ def image_url_from_image(galleryid: int, image: HitomiImageModel, no_webp: bool) def parse_galleryinfo(galleryinfo_json: dict) -> HitomiGalleryInfoModel: if not galleryinfo_json["tags"]: - parsed_tags = [] + parsed_tags: List = [] else: parsed_tags = [] for tag in galleryinfo_json["tags"]: diff --git a/rabbitark/extractor/pixiv.py b/rabbitark/extractor/pixiv.py index 746a0ee..fad2d1b 100644 --- a/rabbitark/extractor/pixiv.py +++ b/rabbitark/extractor/pixiv.py @@ -2,12 +2,13 @@ from rabbitark.config import config from rabbitark.error import NotFound -from rabbitark.utils.default_class 
import Image, Info -from rabbitark.utils.request import Requester +from rabbitark.rabbitark import RabbitArk +from rabbitark.utils.default_class import DownloadInfo, Image +from rabbitark.utils.request import Request from rabbitark.utils.utils import folder_name_checker, get_urls, split -class PixivRequester(Requester): +class PixivRequester(Request): def __init__(self): super().__init__( headers={ @@ -78,7 +79,7 @@ async def single_images(self, illust_id): if not info: return urls = await self.get_illust_urls(illust_id) - return Info( + return DownloadInfo( [Image(url) for url in urls], folder_name_checker(info["body"]["title"]), self.headers, @@ -89,18 +90,19 @@ async def user(self, user_id): if not username: return url_list = await self.user_images(user_id) - return Info( + return DownloadInfo( [Image(url) for url in url_list], folder_name_checker(username), self.headers, ) +@RabbitArk.register("pixiv") class Pixiv(PixivRequester): def __init__(self): super().__init__() - async def download_info(self, downloadable: Any) -> Info: + async def extractor_download(self, downloadable: Any) -> DownloadInfo: if downloadable.isdigit(): info = await self.checking_id(downloadable) else: diff --git a/rabbitark/extractor/youtube.py b/rabbitark/extractor/youtube.py new file mode 100644 index 0000000..c05c820 --- /dev/null +++ b/rabbitark/extractor/youtube.py @@ -0,0 +1,348 @@ +""" +Youtube Playlist Extractor + +https://github.com/kijk2869/discodo/blob/master/discodo/extractor/youtube/playlist.py + +Used in function: extract_playlist + +MIT License + +Copyright (c) 2020 매리 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject 
to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +Youtube URL Regex + +youtube_dl/extractor/youtube.py + +Used in constant: VALID_URL, PLAYLIST_VAILD_URL + +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+ +For more information, please refer to +""" + +import json +import re +from typing import Any, Dict, List, Match, Optional, Pattern + +from rabbitark.config import config +from rabbitark.error import NotFound +from rabbitark.rabbitark import RabbitArk +from rabbitark.utils.default_class import DownloadInfo, Image, Response +from rabbitark.utils.request import Request + +VALID_URL: str = r"""(?x)^ + ( + (?:https?://|//) # http(s):// or protocol-independent URL + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/| + (?:www\.)?deturl\.com/www\.youtube\.com/| + (?:www\.)?pwnyoutube\.com/| + (?:www\.)?hooktube\.com/| + (?:www\.)?yourepeat\.com/| + tube\.majestyc\.net/| + # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances + (?:(?:www|dev)\.)?invidio\.us/| + (?:(?:www|no)\.)?invidiou\.sh/| + (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| + (?:www\.)?invidious\.kabi\.tk/| + (?:www\.)?invidious\.13ad\.de/| + (?:www\.)?invidious\.mastodon\.host/| + (?:www\.)?invidious\.nixnet\.xyz/| + (?:www\.)?invidious\.drycat\.fr/| + (?:www\.)?tube\.poal\.co/| + (?:www\.)?vid\.wxzm\.sx/| + (?:www\.)?yewtu\.be/| + (?:www\.)?yt\.elukerio\.org/| + (?:www\.)?yt\.lelux\.fi/| + (?:www\.)?invidious\.ggc-project\.de/| + (?:www\.)?yt\.maisputain\.ovh/| + (?:www\.)?invidious\.13ad\.de/| + (?:www\.)?invidious\.toot\.koeln/| + (?:www\.)?invidious\.fdn\.fr/| + (?:www\.)?watch\.nettohikari\.com/| + (?:www\.)?kgg2m7yk5aybusll\.onion/| + (?:www\.)?qklhadlycap4cnod\.onion/| + (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| + (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| + (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| + (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| + (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| + 
(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/| + youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains + (?:.*?\#/)? # handle anchor (#/) redirect urls + (?: # the various things that can precede the ID: + (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ + |(?: # or the v= param in all its forms + (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:\?|\#!?) # the params delimiter ? or # or #! + (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) + v= + ) + )) + |(?: + youtu\.be| # just youtu.be/xxxx + vid\.plus| # or vid.plus/xxxx + zwearz\.com/watch| # or zwearz.com/watch/xxxx + )/ + |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= + ) + )? # all until now is optional -> you can pass the naked ID + ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID + (?!.*?\blist= + (?: + %(playlist_id)s| # combined list/video URLs are handled by the playlist IE + WL # WL are handled by the watch later IE + ) + ) + (?(1).+)? # if we found the ID, everything can follow + $""" + +PLAYLIST_VALID_URL: str = r"""(?x)(?: + (?:https?://)? + (?:\w+\.)? + (?: + (?: + youtube(?:kids)?\.com| + invidio\.us + ) + / + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) + \? (?:.*?[&;])*? 
(?:p|a|list)= + | p/ + )| + youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= + ) + ( + (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,} + # Top tracks, they can also include dots + |(?:MC)[\w\.]* + ) + .* + | + (%(playlist_id)s) + )""" + +DATA_JSON: Pattern = re.compile( + r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' +) + +PLAYLIST_PREFIX_LIST: List[str] = [ + "PL", + "LL", + "EC", + "UU", + "FL", + "RD", + "UL", + "TL", + "PU", + "OLAK5uy_", +] + + +class YoutubeRequester(Request): + def __init__(self) -> None: + super().__init__( + headers={ + "x-youtube-client-name": "1", + "x-youtube-client-version": "2.20201030.01.00", + } + ) + + async def extract_playlist(self, playlist_id: str) -> List[Dict[str, Any]]: + if playlist_id.startswith(("RD", "UL", "PU")): + raise TypeError("playlistId is Youtube Mix id") + + response: Response = await self.get( + f"https://www.youtube.com/playlist", + "text", + params={"list": playlist_id, "hl": "en"}, + ) + + search: Optional[Match] = DATA_JSON.search(response.body) + + if not search: + raise ValueError + + data: Dict[str, Any] = json.loads(search.group(1)) + + if data.get("alerts"): + raise Exception(data["alerts"][0]["alertRenderer"]["text"]["simpleText"]) + + firstPlaylistData: Dict[str, Any] = data["contents"][ + "twoColumnBrowseResultsRenderer" + ]["tabs"][0]["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0][ + "itemSectionRenderer" + ][ + "contents" + ][ + 0 + ][ + "playlistVideoListRenderer" + ] + + Sources: List = [] + + def extract_playlist( + playlistData: Dict, name: str = "contents" + ) -> Optional[str]: + trackList: Optional[List[Any]] = playlistData.get(name) + if not trackList: + return None + + continuationsTokens: list = [] + + def extract(Track: dict) -> Optional[dict]: + if "playlistVideoRenderer" in Track: + renderer: Dict = Track.get("playlistVideoRenderer", {}) + shortBylineText: Optional[Dict] = renderer.get("shortBylineText") + + if not renderer.get("isPlayable") or not 
shortBylineText: + return None + + return { + "id": renderer["videoId"], + "title": renderer["title"].get("simpleText") + or renderer["title"]["runs"][0]["text"], + "webpage_url": "https://youtube.com/watch?v=" + + renderer["videoId"], + "uploader": shortBylineText["runs"][0]["text"], + "duration": renderer["lengthSeconds"], + } + elif "continuationItemRenderer" in Track: + continuationsTokens.append( + Track["continuationItemRenderer"]["continuationEndpoint"][ + "continuationCommand" + ]["token"] + ) + + return None + else: + return None + + Sources.extend(map(extract, trackList)) + + if not continuationsTokens: + return None + + return ( + "https://www.youtube.com/browse_ajax?continuation=" + + continuationsTokens[0] + + "&ctoken=" + + continuationsTokens[0] + + "&hl=en" + ) + + continuations_url: Optional[str] = extract_playlist(firstPlaylistData) + for _ in range(config.YOUTUBE_PAGE_LIMIT): + if not continuations_url: + break + + body = await self.get(continuations_url, "json") + + nextPlaylistData: Dict = body.body[1]["response"][ + "onResponseReceivedActions" + ][0]["appendContinuationItemsAction"] + + continuations_url = extract_playlist( + nextPlaylistData, name="continuationItems" + ) + + return list(filter(None, Sources)) + + async def checking_url(self, url): + single_video = re.findall(VALID_URL, url) + if single_video: + return self.make_info_video(single_video[0][1]) + + playlist = re.findall(PLAYLIST_VALID_URL, url) + if playlist: + return await self.make_info_playlist(playlist[0][0]) + + return + + async def checking_id(self, yt_id: str): + response = await self.get( + f"https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v={yt_id}" + ) + if response.status != 200: + for prefix in PLAYLIST_PREFIX_LIST: + if yt_id.startswith(prefix): + return await self.make_info_playlist(yt_id) + + else: + return + + return self.make_info_video(yt_id) + + def get_thumbnail(self, video_id): + return Image( + 
class RabbitArk(Downloader):
    """Downloader front end that looks up an extractor by its registered name."""

    # Registry shared by all instances: extractor name -> extractor class.
    extractor_dict: dict[str, type[BaseExtractor]] = {}

    def __init__(self, extractor_name: str, config: Config) -> None:
        self.extractor_name = extractor_name
        super().__init__(config)

    @classmethod
    def register(cls, extractor_name: str):
        """Return a class decorator that stores the class under ``extractor_name``."""

        def decorator(extractor_class: type[BaseExtractor]):
            cls.extractor_dict[extractor_name] = extractor_class
            return extractor_class

        return decorator

    async def start(self, download_source: Any) -> None:
        """Extract download information for ``download_source`` and download it.

        The extractor's session, if one was opened, is closed before the
        download starts, whether or not extraction succeeded.

        Raises:
            KeyError: If no extractor is registered under ``self.extractor_name``.
        """
        extractor_class = self.extractor_dict[self.extractor_name]
        extractor = extractor_class()
        try:
            download_info = await extractor.get_download_info(
                download_source, self.config
            )
        finally:
            if extractor.session:
                await extractor.session.close()

        await self.start_download(download_info)
await self.request(self.session, url, "GET", return_method) + + async def post( + self, + url: str, + return_method: RETURN_METHOD, + ) -> Any: + if not self.session: + self.session = ClientSession() + return await self.request(self.session, url, "POST", return_method) + + +class SessionPoolRequest(Request): + def __init__(self) -> None: + self._session_pool: list[ClientSession] = [] + + @property + def session_pool(self) -> list[ClientSession]: + return self._session_pool + + async def request_using_session_pool( + self, + request_func: Callable[..., Awaitable[Any]], + url: list[str], + method: str, + return_method: Optional[RETURN_METHOD] = None, + request_per_session: int = 10, + **kwargs: Any, + ): + try: + pool_size = ceil(len(url) / request_per_session) + + request_per_session_url_list = [ + url[pos : pos + pool_size] for pos in range(0, len(url), pool_size) + ] + + for _ in range(len(request_per_session_url_list)): + self.session_pool.append(ClientSession()) + + request_list = [ + request_func(session, url, method, return_method, **kwargs) + for session, url_list in zip( + self.session_pool, request_per_session_url_list + ) + for url in url_list + ] + + done, _ = await wait(request_list) + return done + + finally: + while self.session_pool: + session = self.session_pool.pop(0) + await session.close() + + async def multiple_get( + self, + url: list[str], + return_method: RETURN_METHOD, + request_per_session: int = 10, + **kwargs: Any, + ): + return await self.request_using_session_pool( + self.request, url, "GET", return_method, request_per_session, **kwargs + ) + + async def multiple_post( + self, + url: list[str], + return_method: RETURN_METHOD, + request_per_session: int = 10, + **kwargs: Any, + ): + return await self.request_using_session_pool( + self.request, url, "POST", return_method, request_per_session, **kwargs + ) diff --git a/rabbitark/typing.py b/rabbitark/typing.py new file mode 100644 index 0000000..888b3de --- /dev/null +++ 
b/rabbitark/typing.py @@ -0,0 +1,5 @@ +from typing import Any, Awaitable, Callable, Literal, TypeVar + +CA = TypeVar("CA", bound=Callable[..., Awaitable[Any]]) + +RETURN_METHOD = Literal["json", "text", "read"] diff --git a/rabbitark/utils.py b/rabbitark/utils.py new file mode 100644 index 0000000..346b45d --- /dev/null +++ b/rabbitark/utils.py @@ -0,0 +1,36 @@ +import logging +from http.cookiejar import MozillaCookieJar +from http.cookies import SimpleCookie +from importlib.util import module_from_spec, spec_from_file_location +from os.path import basename +from re import sub +from typing import Optional +from urllib.parse import urlparse + +logger = logging.getLogger("rabbitark.utils.load_dynamic_module") + + +def import_dynamic_module(path: str): + logger.debug("extractor path: %s", path) + spec = spec_from_file_location("dynamic_module", path) + module_from_spec(spec) + logger.info("sucees import") + + +def split(url: str) -> str: + a = urlparse(url) + return basename(a.path) + + +def load_rawcookie(rawdata: str) -> dict[str, str]: + cookie: SimpleCookie[str] = SimpleCookie(rawdata) + return {key: morsel.value for key, morsel in cookie.items()} + + +def load_cookie_txt(filename: str) -> dict[str, Optional[str]]: + cj = MozillaCookieJar(filename) + return {each.name: each.value for each in cj} + + +def folder_name_checker(foldername: str) -> str: + return sub(r"[\\/:*?\"<>\|]", "", foldername) diff --git a/rabbitark/utils/__init__.py b/rabbitark/utils/__init__.py deleted file mode 100644 index a7862d7..0000000 --- a/rabbitark/utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .default_class import DownloadInfo, Image, Info -from .request import Requester -from .utils import * diff --git a/rabbitark/utils/default_class.py b/rabbitark/utils/default_class.py deleted file mode 100644 index ffe4b4b..0000000 --- a/rabbitark/utils/default_class.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Any - -from rabbitark.utils.utils import split - - -class Image: - 
def __init__(self, url: str, filename: str = None): - self.url = url - self.filename = filename if filename else split(url) - - -class Info: - def __init__(self, image: list[Image], title: str = None, headers: Any = None): - self.title = title - self.image = image - self.headers = headers - - -class DownloadInfo: - def __init__(self, url: str, directory: str, headers: Any = None): - self.url = url - self.directory = directory - self.headers = headers - - -class Response: - def __init__(self, status: int, message: str, body: Any): - self.status = status - self.message = message - self.body = body diff --git a/rabbitark/utils/extractor_dict.py b/rabbitark/utils/extractor_dict.py deleted file mode 100644 index 33a18e8..0000000 --- a/rabbitark/utils/extractor_dict.py +++ /dev/null @@ -1,3 +0,0 @@ -from rabbitark.extractor import Hitomi, Pixiv - -extractor = {"hitomi": Hitomi, "pixiv": Pixiv} diff --git a/rabbitark/utils/load_dynamic_module.py b/rabbitark/utils/load_dynamic_module.py deleted file mode 100644 index 5c37f6e..0000000 --- a/rabbitark/utils/load_dynamic_module.py +++ /dev/null @@ -1,14 +0,0 @@ -import importlib.util - - -def import_dynamic_module(path: str): - spec = importlib.util.spec_from_file_location("dynamic_module", path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def load_extensions(path): - module = import_dynamic_module(path) - main_class = module.load() - return main_class diff --git a/rabbitark/utils/request.py b/rabbitark/utils/request.py deleted file mode 100644 index 374009e..0000000 --- a/rabbitark/utils/request.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Any, Optional - -import aiohttp - -from rabbitark.utils.default_class import Response - - -class Requester: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - @property - def headers(self): - return self.kwargs.get("headers") - - async def request( - self, url: str, method: str, 
response_method: str, *args, **kwargs - ) -> Response: - async with aiohttp.ClientSession(*self.args, **self.kwargs) as cs: - async with cs.request(method, url, *args, **kwargs) as response: - dispatch = { - "json": response.json, - "read": response.read, - "text": response.text, - } - if response_method not in dispatch: - raise ValueError( - f"Invalid response_method value: {response_method}" - ) - return Response( - response.status, response.reason, await dispatch[response_method]() - ) - - async def get( - self, url: str, response_method: str = "read", *args, **kwargs - ) -> Response: - """Perform HTTP GET request.""" - return await self.request(url, "GET", response_method, *args, **kwargs) - - async def post(self, url: str, response_method: str, *args, **kwargs) -> Response: - """Perform HTTP POST request.""" - return await self.request(url, "POST", response_method, *args, **kwargs) diff --git a/rabbitark/utils/utils.py b/rabbitark/utils/utils.py deleted file mode 100644 index cbb5be5..0000000 --- a/rabbitark/utils/utils.py +++ /dev/null @@ -1,37 +0,0 @@ -import re -from http.cookiejar import MozillaCookieJar -from http.cookies import SimpleCookie - -from aiomultiprocess import Pool - - -def split(url: str): - return url.rsplit("/", 1)[1] - - -async def get_urls(func, arg: list) -> None: - result = [] - async with Pool() as pool: - async for url in pool.map(func, arg): - if isinstance(url, list): - result.extend(url) - else: - result.append(url) - - return result - - -def load_rawcookie(rawdata): - cookie = SimpleCookie() - cookie.load(rawdata) - return {key: morsel.value for key, morsel in cookie.items()} - - -def load_cookie_txt(filename): - cj = MozillaCookieJar() - cj.load(filename) - return {each.name: each.value for each in cj} - - -def folder_name_checker(foldername): - return re.sub(r"[\\/:*?\"<>\|]", "", foldername) diff --git a/requirements.txt b/requirements.txt index 8980818..addd13b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 
@@ -aiohttp==3.7.2 -aiomultiprocess==0.8.0 -aiofiles==0.6.0 \ No newline at end of file +aiohttp==3.7.4.post0 +aiomultiprocess==0.9.0 +aiofiles==0.7.0 \ No newline at end of file diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..6c73dbf --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,8 @@ +import pytest + +from rabbitark.rabbitark import RabbitArk + + +@pytest.fixture +def rabbitark(option): + return RabbitArk(option) diff --git a/test/test_download.py b/test/test_download.py new file mode 100644 index 0000000..591bf7f --- /dev/null +++ b/test/test_download.py @@ -0,0 +1,24 @@ +import pytest + +from rabbitark.extractor import load + + +@pytest.mark.asyncio +@pytest.mark.parametrize("option", ["hitomi"]) +async def test_download_hitomi(rabbitark): + load() + await rabbitark.start("1") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("option", ["pixiv"]) +async def test_download_pixiv(rabbitark): + load() + await rabbitark.start("9666585") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("option", ["youtube"]) +async def test_download_youtube(rabbitark): + load() + await rabbitark.start("PLB6rrfCPynfApD_C0yItgW5WLC0f-wDvG")