Bug fix and more novels
- Added Read Light Novel, which has thousands of novels but poor formatting
- Changed the Novelle Leggere chapter parser to lxml, as html.parser was unreliable
EternalTrail authored May 6, 2020
1 parent 8308740 commit e71ecf7
Showing 4 changed files with 196 additions and 16 deletions.
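The new bsParser option is threaded from each site parser through NovelDownloader into PageTools().getSoupFromUrl. PageTools itself is not part of this diff; the sketch below shows the assumed shape of that wrapper (the standalone function name and the requests-based implementation are assumptions, not the project's actual code):

```python
# Minimal sketch of the assumed PageTools.getSoupFromUrl wrapper.
# "lxml" tolerates the malformed markup on Novelle Leggere better than the
# built-in "html.parser", which stays the default for the other sites.
import requests
from bs4 import BeautifulSoup

def getSoupFromUrl(url, parser="html.parser"):
    # Fetch the page and hand the chosen parser backend to BeautifulSoup
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.content, parser)
```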
32 changes: 22 additions & 10 deletions NovelDownloader.py
@@ -4,7 +4,7 @@

class NovelDownloader:

def generateBookFromToMulti(parser, novelName, startChapter, endChapter, customCoverFilename = None, customBookName = None, callback=None, poolSize = 50, idnum = None):
def generateBookFromToMulti(parser, novelName, startChapter, endChapter, customCoverFilename = None, customBookName = None, callback=None, poolSize = 50, idnum = None, bsParser = None):

# Download and clean all of the chapters
chapterLinks = parser.getNovelChapterLinks(novelName)
@@ -16,7 +16,10 @@ def generateBookFromToMulti(parser, novelName, startChapter, endChapter, customC
def callback(id):
return False
def downloadPage(link):
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1])
if bsParser == None:
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1])
else:
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1], parser = bsParser)
if callback(idnum):
raise RuntimeError("Process terminated")
with ThreadPool(poolSize) as pool:
@@ -36,7 +39,7 @@ def downloadPage(link):
EBookGenerator().generateEBook(chapters, novelName, "{}-{}".format(startChapter+1, endChapter+1), parser.novels[novelName][2], image)


def generateBookFromTo(parser, novelName, startChapter, endChapter, customCoverFilename = None, customBookName = None, callback=None, idnum = None):
def generateBookFromTo(parser, novelName, startChapter, endChapter, customCoverFilename = None, customBookName = None, callback=None, idnum = None, bsParser = None):

# Download and clean all of the chapters
chapterLinks = parser.getNovelChapterLinks(novelName)
@@ -47,7 +50,10 @@ def generateBookFromTo(parser, novelName, startChapter, endChapter, customCoverF
def callback(id):
return False
for link in chapterLinks[startChapter:endChapter+1]:
pot_of_soup.append(PageTools().getSoupFromUrl(link))
if bsParser == None:
pot_of_soup.append(PageTools().getSoupFromUrl(link))
else:
pot_of_soup.append(PageTools().getSoupFromUrl(link, parser = bsParser))
callback(idnum)
chapters = [parser.cleanChapter(soup) for soup in pot_of_soup]

@@ -64,7 +70,7 @@ def callback(id):
EBookGenerator().generateEBook(chapters, novelName, "{}-{}".format(startChapter+1, endChapter+1), parser.novels[novelName][2], image)


def generateBookMulti(parser, novelName, bookName, customCoverFilename = None, customBookName = None, callback=None, poolSize = 50, idnum = None):
def generateBookMulti(parser, novelName, bookName, customCoverFilename = None, customBookName = None, callback=None, poolSize = 50, idnum = None, bsParser = None):

# Download and clean all of the chapters in the book
chapterLinks = parser.getNovelBookChapterLinks(novelName, bookName)
@@ -74,7 +80,10 @@ def generateBookMulti(parser, novelName, bookName, customCoverFilename = None, c
def callback(id):
return False
def downloadPage(link):
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1])
if bsParser == None:
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1])
else:
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1], parser = bsParser)
if callback(idnum):
raise RuntimeError("Process terminated")
with ThreadPool(poolSize) as pool:
@@ -94,7 +103,7 @@ def downloadPage(link):
EBookGenerator().generateEBook(chapters, novelName, bookName, parser.novels[novelName][2], image)


def generateBook(parser, novelName, bookName, customCoverFilename = None, customBookName = None, callback=None, idnum = None):
def generateBook(parser, novelName, bookName, customCoverFilename = None, customBookName = None, callback=None, idnum = None, bsParser = None):

# Download and clean all of the chapters in the book
chapterLinks = parser.getNovelBookChapterLinks(novelName, bookName)
@@ -103,7 +112,10 @@ def generateBook(parser, novelName, bookName, customCoverFilename = None, custom
def callback(id):
return False
for link in chapterLinks:
pot_of_soup.append(PageTools().getSoupFromUrl(link))
if bsParser == None:
pot_of_soup.append(PageTools().getSoupFromUrl(link))
else:
pot_of_soup.append(PageTools().getSoupFromUrl(link, parser = bsParser))
callback(idnum)
chapters = [parser.cleanChapter(soup) for soup in pot_of_soup]

@@ -120,13 +132,13 @@ def callback(id):
EBookGenerator().generateEBook(chapters, novelName, bookName, parser.novels[novelName][2], image)


def generateBooks(parser, novelName, bookNames, customCoverFilename = None, customBookNames = None, callback=None, idnum = None):
def generateBooks(parser, novelName, bookNames, customCoverFilename = None, customBookNames = None, callback=None, idnum = None, bsParser = None):

# If no custom book names are input, fill the input list with None objects
if customBookNames == None:
customBookNames = [None] * len(bookNames)

# Download each book separately
for bookName, customBookName in zip(bookNames, customBookNames):
self.generateBook(parser, novelName, bookNames, customCoverFilename, customBookName, callback, idnum)
NovelDownloader.generateBook(parser, novelName, bookName, customCoverFilename, customBookName, callback, idnum, bsParser)
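The `if bsParser == None` branch above is repeated in all four download paths; a small helper along these lines (hypothetical, not part of the commit) could express the fallback once:

```python
from typing import Optional

def fetchSoup(pageTools, url: str, bsParser: Optional[str] = None):
    # Fall back to the wrapper's own default parser when no name is supplied,
    # otherwise forward the site parser's preferred BeautifulSoup backend.
    if bsParser is None:
        return pageTools.getSoupFromUrl(url)
    return pageTools.getSoupFromUrl(url, parser=bsParser)
```

Each downloadPage closure could then call fetchSoup(PageTools(), link[1], bsParser) regardless of which backend the active site parser prefers.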

169 changes: 168 additions & 1 deletion NovelParsers.py
@@ -21,6 +21,7 @@ def __init__(self):
self.novelNames = None
self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -250,6 +251,7 @@ def __init__(self):
self.novelNames = []
#self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -414,6 +416,7 @@ def __init__(self):
self.novelNames = None
self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -593,6 +596,7 @@ def __init__(self):
self.novelNames = None
# self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -749,6 +753,7 @@ def __init__(self):
self.novelNames = None
# self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "lxml"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -885,7 +890,8 @@ def getNovelBookChapterNames(self, novelName, bookName):
def cleanChapter(self, soup):

hasSpoiler = None

# print(soup)
# soup = BeautifulSoup(str(soup), 'lxml')
# Extract the chapter title and the chapter content
chapterTitle = soup.find(class_="entry-title fusion-post-title").string
content = soup.find(class_="post-content")
@@ -914,5 +920,166 @@ def cleanChapter(self, soup):
return BeautifulSoup(chapter, "html.parser")


class ReadLightNovelParser:

def __init__(self):

self.url = "https://www.readlightnovel.org/"
self.name = "Read Light Novel"


# Create containers
self.novels = {}
self.novelNames = None
self.novelSypnoses = {}
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}


def load(self):
if not self.isLoaded:
self.parseNovelList()
self.isLoaded = True


def clearNovelCache(self):
self.novelLibrary = {}


def parseNovelList(self):

soup = PageTools().getSoupFromUrl(self.url+"novel-list")
books = PageTools().getElementsFromSoup(soup,[{"class_":"col-lg-12"},{"class_":"list-by-word-body"},"li"])

for book in books:
if PageTools().getElementsFromSoup(book, ["a"])[0]['href'] == "#":
continue
linkTitle = PageTools().getElementsFromSoup(book, [{"data-toggle":"popover"}])[0]
self.novels[linkTitle.string] = [linkTitle['href'], PageTools().getElementsFromSoup(book, ["img"])[0]['src'], "N/A"]
self.novelSypnoses[linkTitle.string] = PageTools().getElementsFromSoup(book, [{"class_":"pop-summary"}], onlyText=True)[0]

self.novelNames = list(self.novels.keys())
self.novelNames.sort()


def loadNovelInfo(self, novelName):

if novelName in self.novelLibrary.keys():
return

# Load the webpage for the novel
soup = PageTools().getSoupFromUrl(self.novels[novelName][0])

# Download cover image
try:
coverImage = PageTools().downloadPage(self.novels[novelName][1])
except:
coverImage = PageTools().downloadPage(noCoverLink)

# Parse all of the book names/sections
bookTitles = PageTools().getElementsFromSoup(soup, [{"id":"accordion"},{"class_":"panel-title"}], onlyText = True)

# Create an empty dictionary to store all chapter names and links
chapterLibrary = []
bookToC = {}
for i, bookTitle in enumerate(bookTitles):

# Extract the html containing the chapter links and names
chapterInfo = PageTools().getElementsFromSoup(soup,[{"id":"collapse-{}".format(i+1)},{"class_":"chapter-chs"},"a"])

# Extract the chapter links and names
chapterInfo = [[chap['href'], bookTitle+", "+chap.string.replace("<",'').replace(">",'')] for chap in chapterInfo]

# Store chapters for each book
cleanTitle = re.sub("\n", "", bookTitle)
bookToC[cleanTitle] = chapterInfo
chapterLibrary.extend(bookToC[cleanTitle])

# Add the books, chapters, and the cover to the novel library
self.novelLibrary[novelName] = [bookTitles, chapterLibrary, bookToC, coverImage]


def getNovelNames(self):

self.load()
return self.novelNames


def getImageBinary(self, novelName):

self.loadNovelInfo(novelName)
return self.novelLibrary[novelName][3]


def getImagePillow(self, novelName):

return Image.open(BytesIO(self.getImageBinary(novelName)))


def getNovelBookNames(self, novelName):

self.loadNovelInfo(novelName)
return self.novelLibrary[novelName][0]


def getNovelChapterLinks(self, novelName):

self.loadNovelInfo(novelName)
return [chapter[0] for chapter in self.novelLibrary[novelName][1]]


def getNovelChapterNames(self, novelName):

self.loadNovelInfo(novelName)
return [chapter[1] for chapter in self.novelLibrary[novelName][1]]


def getNovelBookChapterLinks(self, novelName, bookName):

self.loadNovelInfo(novelName)
return [chapter[0] for chapter in self.novelLibrary[novelName][2][bookName]]


def getNovelBookChapterNames(self, novelName, bookName):

self.loadNovelInfo(novelName)
return [chapter[1] for chapter in self.novelLibrary[novelName][2][bookName]]


def cleanChapter(self, soup):

# Extract the chapter content and pull the chapter title out of it
content = soup.find(class_="desc")
chapterTitle = re.search(r"Chapter \d+", content.decode_contents()).group()

elements = ["ads-title","apester-element"]

for element in elements:
for script in content.find_all(class_=element):
script.decompose()

for a in content.find_all("a"):
a.decompose()

for hr in content.find_all("hr"):
hr.decompose()

for script in content.find_all("script"):
script.decompose()

for div in content.find_all("div"):
div.decompose()

# Add html header to the chapter
chapter = '<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<title>{0}</title>\n</head>\n<body>\n<h1>{0}</h1>\n'.format(chapterTitle)
# str.strip() treats its argument as a set of characters, so drop the duplicated title explicitly
chapter += re.sub(r" \.", ".", content.decode_contents()).replace(chapterTitle, "", 1).strip("\n")

# Return the chapter as a BeautifulSoup html object
return BeautifulSoup(chapter, "html.parser")


if __name__ == "__main__":
pass
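For reference, a hypothetical end-to-end use of the new ReadLightNovelParser with the updated downloader might look like this (module and class names are taken from this diff; the chapter range is illustrative only):

```python
from NovelParsers import ReadLightNovelParser
from NovelDownloader import NovelDownloader

parser = ReadLightNovelParser()
parser.load()                     # slow: scrapes the full novel-list page
novelNames = parser.getNovelNames()

# Download the first ten chapters of the first novel, forwarding the
# parser's preferred BeautifulSoup backend ("html.parser" for this site).
NovelDownloader.generateBookFromToMulti(
    parser, novelNames[0], 0, 9, bsParser=parser.bsParser)
```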
7 changes: 4 additions & 3 deletions README.md
@@ -1,8 +1,8 @@

# Novel 2 E-Book
This Python program will download books and chapters from novels availaible on several websites and saves them as .epub ebooks. About 180 books available as of the latest release (newly added novels should also show up without requiring any updates to the program!).
This Python program will download books and chapters from novels available on several websites and save them as .epub ebooks. About **200** books are available as of the latest release from direct sources, with *__several thousand__* more available from the Read Light Novel site (newly added novels should also show up without requiring any updates to the program!).

Visit [novel-ebook.com](https://novel-ebook.com) for a webapp with the same functionality but with over 2000 supported novels made by [MakeYourLifeEasier](https://github.com/MakeYourLifeEasier). Beware this site is still in beta and you may experience problems.
Visit [novel-ebook.com](https://novel-ebook.com) for a webapp with the same functionality made by [MakeYourLifeEasier](https://github.com/MakeYourLifeEasier).

## Getting Started

@@ -16,6 +16,7 @@ To run this script you'll need to have Python 3.x.x installed which you can find
- [Volare Novels](https://www.volarenovels.com)
- [Totally Translations](https://totallytranslations.com/)
- [Novelle Leggere](https://www.novelleleggere.com/)
- [Read Light Novel](https://www.readlightnovel.org/) *initial load time is slow due to the sheer number of novels* **also, poor-quality formatting**
- Automatically adds some metadata like title and cover
- Concurrent download of chapters - *significantly* faster download of books than downloading them one by one (an order of magnitude or two faster)

@@ -39,7 +40,7 @@ path/where/you/installed/python.exe novel2ebook.py

### Prerequisites

As mentioned before this script was written for Python version 3.7.x. It may work with other versions too but none are tested.
As mentioned before, this script was written for Python version 3.x.x. It may work with other versions too, but none have been tested.
Additionally, the Python Imaging Library (Pillow), lxml, and BeautifulSoup4 are required.
To install all dependencies just use the console to navigate into the project folder and write

4 changes: 2 additions & 2 deletions novel2ebook.py
@@ -184,15 +184,15 @@ def onDownloadButtonClick(self):
self.TKW.guiElements["ProgressBar"].config(mode="determinate", maximum = endChapter-startChapter+1, value = 0)
self.newThread = threading.Thread(target=NovelDownloader.generateBookFromToMulti, args=(self.selectedParser,\
novel, startChapter, endChapter), kwargs={"callback":self.updateProgresstrack,\
"poolSize":self.poolSize, "idnum":self.progressTrackID})
"poolSize":self.poolSize, "idnum":self.progressTrackID, "bsParser":self.selectedParser.bsParser})
self.newThread.start()
#NovelDownloader.generateBookFromToMulti(self.selectedParser, novel, startChapter, endChapter, callback = self.updateProgresstrack)
else:
chapterList = self.selectedParser.getNovelBookChapterLinks(novel, self.TKW.guiElements["BookCombobox"][0].get())
self.TKW.guiElements["ProgressBar"].config(mode="determinate", maximum = len(chapterList), value = 0)
self.newThread = threading.Thread(target=NovelDownloader.generateBookMulti, args=(self.selectedParser,\
novel, self.TKW.guiElements["BookCombobox"][0].get()), kwargs={"callback":self.updateProgresstrack,\
"poolSize":self.poolSize, "idnum":self.progressTrackID})
"poolSize":self.poolSize, "idnum":self.progressTrackID, "bsParser":self.selectedParser.bsParser})
self.newThread.start()
#NovelDownloader.generateBook(self.selectedParser, novel, self.TKW.guiElements["BookCombobox"][0].get(), callback = self.updateProgresstrack)

