Added episode selection and sorting episodes into folders for comics. #35

Open · wants to merge 2 commits into master
37 changes: 23 additions & 14 deletions README.md
@@ -35,9 +35,10 @@ This is a downloader to download and update whole comics from https://tapas.io/.
* (thx @ [ONSKJ](https://github.com/ONSKJ) for help with Windows)
+ If someone got it running on another OS, please let me know!
2. Get input link
* Go to the comic you want to download (any episode)
* Right-click on the comic name in the upper left corner and select "Copy link address" (or similar) or just use the name after series in the URL.
* Examples: `https://tapas.io/series/Erma`, `RavenWolf`, ...
* Optional - get the episode ID(s), e.g. 255222 for `https://tapas.io/episode/255222` (the first episode of Erma).
3. Start the download
* Usage of `tapas-dl.py`:
```
@@ -50,24 +51,32 @@ This is a downloader to download and update whole comics from https://tapas.io/.

positional arguments:
  URL/name              URL or URL name to comic
                        Go to the comic you want to download (any episode)
                        Right-click on the comic name in the upper left corner and select "Copy link address" (or similar) or just use the name after series in the URL
                        Examples: https://tapas.io/series/Erma, RavenWolf, ...

optional arguments:
  -h, --help            show this help message and exit
  -f, --force           Disables updater.
  -v, --verbose         Enables verbose mode.
  -r, --restrict-characters
                        Removes '? < > \ : * | " ^' from file names
  -n, --organize        Organizes episodes into individual folders for comics.
                        Currently incompatible with update mode, use selection instead.
  -c [PATH], --cookies [PATH]
                        Optional cookies.txt file to load, can be used to allow the script to "log in" and circumvent age verification.
  -o [PATH], --output-dir [PATH]
                        Output directory where comics should be placed.
                        If left blank, the script folder will be used.
  -s NUM NUM, --selection NUM NUM
                        Select episodes (aka pages) inclusively by ID to download.
                        -s 2191740 2191740 will download only that episode.
                        -s 136372 2191740 will download all episodes in that range.
```
* The script will create a folder with the name and urlName (`name [urlName]`) of the comic in the current shell location (like git) and download all images of the comic into it.
* If the script finds a folder with the name of the comic, it will only update; this can be disabled with `-f/--force`.
* To get the verbose output use `-v/--verbose`.
* To move images into folders by episode use `-n/--organize`. Currently incompatible with updating; delete the existing folder and use selection instead.
* To select the range of episodes to download, use `-s/--selection beginID endID`. Find the IDs by opening an episode and copying the number at the end of the URL (see the example below).
* To specify a base output path use `-o/--output-dir \desired\path` (if not specified, files and folders will be created where the script was run).
* On some file systems (especially Windows ones) some characters are unsupported; if you run into problems with that, use the `-r/--restrict-characters` option.
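For example, to download only the first episode of Erma (ID 255222, as noted above) into its own episode folder, an invocation might look like this (assuming `python3` is on your PATH):
```
python3 tapas-dl.py https://tapas.io/series/Erma -n -s 255222 255222
```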
64 changes: 48 additions & 16 deletions tapas-dl.py
@@ -49,14 +49,17 @@ def check_path(path, slash=True, fat=False):
# parse input and set up help
parser = argparse.ArgumentParser(description='Downloads Comics from \'https://tapas.io\'.\nIf folder of downloaded comic is found, it will only update (can be disabled with -f/--force).', formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('url', metavar='URL/name', type=str, nargs='+',
                    help='URL or URL name to comic\nGo to the comic you want to download (any episode)\nRight-click on the comic name in the upper left corner and select "Copy link address" (or similar) or just use the name after series in the URL\nExamples: https://tapas.io/series/Erma, RavenWolf, ...')
parser.add_argument('-f', '--force', action="store_true", help='Disables updater.')
parser.add_argument('-v', '--verbose', action="store_true", help='Enables verbose mode.')
parser.add_argument('-r', '--restrict-characters', action="store_true", help='Removes \'? < > \\ : * | " ^\' from file names')
parser.add_argument('-n', '--organize', action="store_true", help='Organizes episodes into individual folders for comics. \nCurrently incompatible with update mode, use selection instead.')
parser.add_argument('-c', '--cookies', type=str, nargs='?', default="", dest='cookies', metavar='PATH',
                    help='Optional cookies.txt file to load, can be used to allow the script to "log in" and circumvent age verification.')
parser.add_argument('-o', '--output-dir', type=str, nargs='?', default="", dest='baseDir', metavar='PATH',
                    help='Output directory where comics should be placed.\nIf left blank, the script folder will be used.')
parser.add_argument('-s', '--selection', type=int, nargs=2, default=[0, 99999999], dest='episodeRange', metavar='NUM',
                    help='Select episodes (aka pages) inclusively by ID to download.\n-s 2191740 2191740 will download only that episode.\n-s 136372 2191740 will download all episodes in that range.')

args = parser.parse_args()
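# e.g. 'tapas-dl.py Erma -s 136372 2191740' parses to args.episodeRange == [136372, 2191740]
# (nargs=2 collects exactly two ints; the default [0, 99999999] effectively selects all episodes)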

@@ -72,6 +75,10 @@ def check_path(path, slash=True, fat=False):
if (args.baseDir):
    basePath = Path(args.baseDir)

episodeRange = [0, 99999999]
if args.episodeRange:
    episodeRange = args.episodeRange
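# episodeRange now holds the inclusive [firstID, lastID] pair from -s/--selection (or the catch-all default)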

for urlCount, url in enumerate(args.url):
    # check url/name
    if re.match(r'^https://tapas\.io/series/.+$', url):
@@ -105,9 +112,11 @@ def check_path(path, slash=True, fat=False):
page = pq(s.get(f'https://tapas.io/series/{seriesId}/episodes?page=1&sort=OLDEST&max_limit=99999999')  # It's over 9000! But I love that they forgot to limit the max_limit, because that means I don't have to bother with pagination ^^
          .json()['data']['body'])
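# Keep only episodes whose numeric ID falls inside the inclusive -s/--selection range.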
for episode in page('[data-permalink*="/episode/"]'):
    tempID = int(episode.attrib['data-permalink'][episode.attrib['data-permalink'].rfind('/') + 1:])
    if tempID >= episodeRange[0] and tempID <= episodeRange[1]:
        data.append({'id': tempID})
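# data is now a list like [{'id': 255222}, ...]; 'title' and 'imgs' are filled in per episode below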

printLine('{} [{}] ({} episodes)'.format(name, urlName, len(data)))

# Check if folder exists, if not create it
printLine('Checking folder...', True)
@@ -117,16 +126,20 @@ def check_path(path, slash=True, fat=False):
savePath = os.path.join(basePath, savePath)
printLine('Full path is: ' + str(savePath))
if os.path.isdir(savePath) and not args.force:
    if not args.organize:
        printLine('Found directory, only updating (use -f/--force to disable)')

        filesInDir = list(os.scandir(savePath))

        fileNames = []
        for fileInDir in filesInDir:
            fileNames.append(fileInDir.name)
        fileNames.sort()

        imgOffset = len(fileNames)
    else:
        printLine('\nWarning!!! Organize mode is incompatible with updates to episode folders.')
        imgOffset = 0

if imgOffset > 1:
    lastFile = fileNames[-1]
@@ -167,6 +180,8 @@ def check_path(path, slash=True, fat=False):
# Get images from page from JS api
allImgCount = 0
for pageCount, pageData in enumerate(data):
    # This is a hack, idk why it's suddenly off by 1
    pageCount += 1
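    # (equivalent to starting enumerate at 1; keeps the progress counter 1-based)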

    # Test whether the page we have in mind is reachable
    pageReqest = s.get(f'https://tapas.io/episode/{pageData["id"]}')
@@ -181,11 +196,11 @@ def check_path(path, slash=True, fat=False):

    else:
        # If the page did not yield an access error, go ahead and scrape for image entries.
        pageHtml = pq(pageReqest.content)

        pageData['title'] = pageHtml('.info__title').text()
        printLine('Downloaded image data from {}, {} images (pages {}/{})...'.format(pageData['title'], allImgCount, pageCount + pageOffset, len(data) + pageOffset), True)

        pageData['imgs'] = []
        for img in pageHtml('.content__img'):
@@ -196,23 +211,30 @@ def check_path(path, slash=True, fat=False):
# Download images
imgCount = 0
for pageCount, pageData in enumerate(data):
    # This is a hack, idk why it's suddenly off by 1
    pageCount += 1
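    # (again, equivalent to starting enumerate at 1)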
    if args.organize:
        episodePath = os.path.join(savePath, check_path('{} [{}]'.format(pageData['id'], pageData['title']), fat=args.restrict_characters))
        os.mkdir(episodePath)
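        # os.mkdir raises FileExistsError if the episode folder already exists,
        # which is one reason organize mode cannot update an existing download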
    else:
        episodePath = savePath

    for imgOfPageCount, img in enumerate(pageData['imgs']):

        # Check if the first image entry is the dummy text that indicates the page was unavailable when we tried to scrape it.
        if pageData['imgs'][0] != "PageUnavailable":
            # If the entry isn't a dummy entry, go ahead and download the images it contains.
            with open(os.path.join(episodePath, check_path('{} - {} - {} - {} - #{}.{}'.format(
                    lead0(imgCount + imgOffset, allImgCount + imgOffset),
                    lead0(pageCount + pageOffset, len(pageData) + pageOffset),
                    lead0(imgOfPageCount, len(pageData['imgs'])),
                    pageData['title'], pageData['id'],
                    img[img.rindex('.') + 1:]),
                    fat=args.restrict_characters)), 'wb') as f:
                f.write(s.get(img).content)
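                # Saved name is e.g. '01 - 01 - 0 - <episode title> - #255222.jpg' (illustrative values)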

            imgCount += 1

            printLine('Downloaded image {}/{} from episode {}/{} ({}/{} images)...'.format(imgOfPageCount + 1, len(pageData['imgs']), pageCount + pageOffset, len(data) + pageOffset, imgCount + imgOffset, allImgCount + imgOffset), True)
        else:
            # If the entry was a dummy entry, skip it and let the user know.
            printLine('Error: No images downloaded from episode {}/{}.'.format(pageCount + pageOffset, len(data) + pageOffset), True)

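# data can be empty when the -s/--selection range matches no episode IDs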
if data != []:
    printLine('Downloaded {} of {} images'.format(imgCount, allImgCount))
@@ -221,6 +243,16 @@ def check_path(path, slash=True, fat=False):

if urlCount + 1 != len(args.url):
    printLine()

#####################################################################################################

    else:
        printLine('Detected novel')
