Bug fix and more novels
- Added Read Light Novel, which has thousands of novels but poor formatting
- Changed the Novelle Leggere chapter parser to lxml, as html.parser was unreliable
EternalTrail authored May 6, 2020
1 parent 8308740 commit e71ecf7
Showing 4 changed files with 196 additions and 16 deletions.
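The new bsParser option is threaded from each site parser through NovelDownloader into PageTools().getSoupFromUrl. PageTools itself is not part of this diff; the sketch below shows the assumed shape of that wrapper (the standalone function name and the requests-based implementation are assumptions, not the project's actual code):

```python
# Minimal sketch of the assumed PageTools.getSoupFromUrl wrapper.
# "lxml" tolerates the malformed markup on Novelle Leggere better than the
# built-in "html.parser", which stays the default for the other sites.
import requests
from bs4 import BeautifulSoup

def getSoupFromUrl(url, parser="html.parser"):
    # Fetch the page and hand the chosen parser backend to BeautifulSoup
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.content, parser)
```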
32 changes: 22 additions & 10 deletions NovelDownloader.py
@@ -4,7 +4,7 @@

class NovelDownloader:

def generateBookFromToMulti(parser, novelName, startChapter, endChapter, customCoverFilename = None, customBookName = None, callback=None, poolSize = 50, idnum = None):
def generateBookFromToMulti(parser, novelName, startChapter, endChapter, customCoverFilename = None, customBookName = None, callback=None, poolSize = 50, idnum = None, bsParser = None):

# Download and clean all of the chapters
chapterLinks = parser.getNovelChapterLinks(novelName)
@@ -16,7 +16,10 @@ def generateBookFromToMulti(parser, novelName, startChapter, endChapter, customC
def callback(id):
return False
def downloadPage(link):
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1])
if bsParser == None:
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1])
else:
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1], parser = bsParser)
if callback(idnum):
raise RuntimeError("Process terminated")
with ThreadPool(poolSize) as pool:
@@ -36,7 +39,7 @@ def downloadPage(link):
EBookGenerator().generateEBook(chapters, novelName, "{}-{}".format(startChapter+1, endChapter+1), parser.novels[novelName][2], image)


def generateBookFromTo(parser, novelName, startChapter, endChapter, customCoverFilename = None, customBookName = None, callback=None, idnum = None):
def generateBookFromTo(parser, novelName, startChapter, endChapter, customCoverFilename = None, customBookName = None, callback=None, idnum = None, bsParser = None):

# Download and clean all of the chapters
chapterLinks = parser.getNovelChapterLinks(novelName)
@@ -47,7 +50,10 @@ def generateBookFromTo(parser, novelName, startChapter, endChapter, customCoverF
def callback(id):
return False
for link in chapterLinks[startChapter:endChapter+1]:
pot_of_soup.append(PageTools().getSoupFromUrl(link))
if bsParser == None:
pot_of_soup.append(PageTools().getSoupFromUrl(link))
else:
pot_of_soup.append(PageTools().getSoupFromUrl(link, parser = bsParser))
callback(idnum)
chapters = [parser.cleanChapter(soup) for soup in pot_of_soup]

@@ -64,7 +70,7 @@ def callback(id):
EBookGenerator().generateEBook(chapters, novelName, "{}-{}".format(startChapter+1, endChapter+1), parser.novels[novelName][2], image)


def generateBookMulti(parser, novelName, bookName, customCoverFilename = None, customBookName = None, callback=None, poolSize = 50, idnum = None):
def generateBookMulti(parser, novelName, bookName, customCoverFilename = None, customBookName = None, callback=None, poolSize = 50, idnum = None, bsParser = None):

# Download and clean all of the chapters in the book
chapterLinks = parser.getNovelBookChapterLinks(novelName, bookName)
@@ -74,7 +80,10 @@ def generateBookMulti(parser, novelName, bookName, customCoverFilename = None, c
def callback(id):
return False
def downloadPage(link):
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1])
if bsParser == None:
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1])
else:
pot_of_soup[link[0]] = PageTools().getSoupFromUrl(link[1], parser = bsParser)
if callback(idnum):
raise RuntimeError("Process terminated")
with ThreadPool(poolSize) as pool:
@@ -94,7 +103,7 @@ def downloadPage(link):
EBookGenerator().generateEBook(chapters, novelName, bookName, parser.novels[novelName][2], image)


def generateBook(parser, novelName, bookName, customCoverFilename = None, customBookName = None, callback=None, idnum = None):
def generateBook(parser, novelName, bookName, customCoverFilename = None, customBookName = None, callback=None, idnum = None, bsParser = None):

# Download and clean all of the chapters in the book
chapterLinks = parser.getNovelBookChapterLinks(novelName, bookName)
@@ -103,7 +112,10 @@ def generateBook(parser, novelName, bookName, customCoverFilename = None, custom
def callback(id):
return False
for link in chapterLinks:
pot_of_soup.append(PageTools().getSoupFromUrl(link))
if bsParser == None:
pot_of_soup.append(PageTools().getSoupFromUrl(link))
else:
pot_of_soup.append(PageTools().getSoupFromUrl(link, parser = bsParser))
callback(idnum)
chapters = [parser.cleanChapter(soup) for soup in pot_of_soup]

@@ -120,13 +132,13 @@ def callback(id):
EBookGenerator().generateEBook(chapters, novelName, bookName, parser.novels[novelName][2], image)


def generateBooks(parser, novelName, bookNames, customCoverFilename = None, customBookNames = None, callback=None, idnum = None):
def generateBooks(parser, novelName, bookNames, customCoverFilename = None, customBookNames = None, callback=None, idnum = None, bsParser = None):

# If no custom book names are input, fill the input list with None objects
if customBookNames == None:
customBookNames = [None] * len(bookNames)

# Download each book separately
for bookName, customBookName in zip(bookNames, customBookNames):
self.generateBook(parser, novelName, bookNames, customCoverFilename, customBookName, callback, idnum)
NovelDownloader.generateBook(parser, novelName, bookName, customCoverFilename, customBookName, callback, idnum, bsParser)
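The `if bsParser == None` branch above is repeated in all four download paths; a small helper along these lines (hypothetical, not part of the commit) could express the fallback once:

```python
from typing import Optional

def fetchSoup(pageTools, url: str, bsParser: Optional[str] = None):
    # Fall back to the wrapper's own default parser when no name is supplied,
    # otherwise forward the site parser's preferred BeautifulSoup backend.
    if bsParser is None:
        return pageTools.getSoupFromUrl(url)
    return pageTools.getSoupFromUrl(url, parser=bsParser)
```

Each downloadPage closure could then call fetchSoup(PageTools(), link[1], bsParser) regardless of which backend the active site parser prefers.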

169 changes: 168 additions & 1 deletion NovelParsers.py
@@ -21,6 +21,7 @@ def __init__(self):
self.novelNames = None
self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -250,6 +251,7 @@ def __init__(self):
self.novelNames = []
#self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -414,6 +416,7 @@ def __init__(self):
self.novelNames = None
self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -593,6 +596,7 @@ def __init__(self):
self.novelNames = None
# self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -749,6 +753,7 @@ def __init__(self):
self.novelNames = None
# self.novelSypnoses = None
self.isLoaded = False
self.bsParser = "lxml"

# Container for all novels that are requested
self.novelLibrary = {}
@@ -885,7 +890,8 @@ def getNovelBookChapterNames(self, novelName, bookName):
def cleanChapter(self, soup):

hasSpoiler = None

# print(soup)
# soup = BeautifulSoup(str(soup), 'lxml')
# Extract the chapter title and the chapter content
chapterTitle = soup.find(class_="entry-title fusion-post-title").string
content = soup.find(class_="post-content")
@@ -914,5 +920,166 @@ def cleanChapter(self, soup):
return BeautifulSoup(chapter, "html.parser")


class ReadLightNovelParser:

def __init__(self):

self.url = "https://www.readlightnovel.org/"
self.name = "Read Light Novel"


# Create containers
self.novels = {}
self.novelNames = None
self.novelSypnoses = {}
self.isLoaded = False
self.bsParser = "html.parser"

# Container for all novels that are requested
self.novelLibrary = {}


def load(self):
if not self.isLoaded:
self.parseNovelList()
self.isLoaded = True


def clearNovelCache(self):
self.novelLibrary = {}


def parseNovelList(self):

soup = PageTools().getSoupFromUrl(self.url+"novel-list")
books = PageTools().getElementsFromSoup(soup,[{"class_":"col-lg-12"},{"class_":"list-by-word-body"},"li"])

for book in books:
if PageTools().getElementsFromSoup(book, ["a"])[0]['href'] == "#":
continue
linkTitle = PageTools().getElementsFromSoup(book, [{"data-toggle":"popover"}])[0]
self.novels[linkTitle.string] = [linkTitle['href'], PageTools().getElementsFromSoup(book, ["img"])[0]['src'], "N/A"]
self.novelSypnoses[linkTitle.string] = PageTools().getElementsFromSoup(book, [{"class_":"pop-summary"}], onlyText=True)[0]

self.novelNames = list(self.novels.keys())
self.novelNames.sort()


def loadNovelInfo(self, novelName):

if novelName in self.novelLibrary.keys():
return

# Load the webpage for the novel
soup = PageTools().getSoupFromUrl(self.novels[novelName][0])

# Download cover image
try:
coverImage = PageTools().downloadPage(self.novels[novelName][1])
except:
coverImage = PageTools().downloadPage(noCoverLink)

# Parse all of the book names/sections
bookTitles = PageTools().getElementsFromSoup(soup, [{"id":"accordion"},{"class_":"panel-title"}], onlyText = True)

# Create an empty dictionary to store all chapter names and links
chapterLibrary = []
bookToC = {}
for i, bookTitle in enumerate(bookTitles):

# Extract the html containing the chapter links and names
chapterInfo = PageTools().getElementsFromSoup(soup,[{"id":"collapse-{}".format(i+1)},{"class_":"chapter-chs"},"a"])

# Extract the chapter links and names
chapterInfo = [[chap['href'], bookTitle+", "+chap.string.replace("<",'').replace(">",'')] for chap in chapterInfo]

# Store chapters for each book
cleanTitle = re.sub("\n", "", bookTitle)
bookToC[cleanTitle] = chapterInfo
chapterLibrary.extend(bookToC[cleanTitle])

# Add the books, chapters, and the cover to the novel library
self.novelLibrary[novelName] = [bookTitles, chapterLibrary, bookToC, coverImage]


def getNovelNames(self):

self.load()
return self.novelNames


def getImageBinary(self, novelName):

self.loadNovelInfo(novelName)
return self.novelLibrary[novelName][3]


def getImagePillow(self, novelName):

return Image.open(BytesIO(self.getImageBinary(novelName)))


def getNovelBookNames(self, novelName):

self.loadNovelInfo(novelName)
return self.novelLibrary[novelName][0]


def getNovelChapterLinks(self, novelName):

self.loadNovelInfo(novelName)
return [chapter[0] for chapter in self.novelLibrary[novelName][1]]


def getNovelChapterNames(self, novelName):

self.loadNovelInfo(novelName)
return [chapter[1] for chapter in self.novelLibrary[novelName][1]]


def getNovelBookChapterLinks(self, novelName, bookName):

self.loadNovelInfo(novelName)
return [chapter[0] for chapter in self.novelLibrary[novelName][2][bookName]]


def getNovelBookChapterNames(self, novelName, bookName):

self.loadNovelInfo(novelName)
return [chapter[1] for chapter in self.novelLibrary[novelName][2][bookName]]


def cleanChapter(self, soup):

# Extract the chapter content and pull the chapter title out of it
content = soup.find(class_="desc")
chapterTitle = re.search(r"Chapter \d+", content.decode_contents()).group()

elements = ["ads-title","apester-element"]

for element in elements:
for script in content.find_all(class_=element):
script.decompose()

for a in content.find_all("a"):
a.decompose()

for hr in content.find_all("hr"):
hr.decompose()

for script in content.find_all("script"):
script.decompose()

for div in content.find_all("div"):
div.decompose()

# Add html header to the chapter
chapter = '<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<title>{0}</title>\n</head>\n<body>\n<h1>{0}</h1>\n'.format(chapterTitle)
# str.strip() treats its argument as a set of characters, so drop the duplicated title explicitly
chapter += re.sub(r" \.", ".", content.decode_contents()).replace(chapterTitle, "", 1).strip("\n")

# Return the chapter as a BeautifulSoup html object
return BeautifulSoup(chapter, "html.parser")


if __name__ == "__main__":
pass
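For reference, a hypothetical end-to-end use of the new ReadLightNovelParser with the updated downloader might look like this (module and class names are taken from this diff; the chapter range is illustrative only):

```python
from NovelParsers import ReadLightNovelParser
from NovelDownloader import NovelDownloader

parser = ReadLightNovelParser()
parser.load()                     # slow: scrapes the full novel-list page
novelNames = parser.getNovelNames()

# Download the first ten chapters of the first novel, forwarding the
# parser's preferred BeautifulSoup backend ("html.parser" for this site).
NovelDownloader.generateBookFromToMulti(
    parser, novelNames[0], 0, 9, bsParser=parser.bsParser)
```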
7 changes: 4 additions & 3 deletions README.md
@@ -1,8 +1,8 @@

# Novel 2 E-Book
This Python program will download books and chapters from novels availaible on several websites and saves them as .epub ebooks. About 180 books available as of the latest release (newly added novels should also show up without requiring any updates to the program!).
This Python program will download books and chapters from novels available on several websites and save them as .epub ebooks. About **200** books are available as of the latest release from direct sources, with *__several thousand__* more available from the Read Light Novel site (newly added novels should also show up without requiring any updates to the program!).

Visit [novel-ebook.com](https://novel-ebook.com) for a webapp with the same functionality but with over 2000 supported novels made by [MakeYourLifeEasier](https://github.com/MakeYourLifeEasier). Beware this site is still in beta and you may experience problems.
Visit [novel-ebook.com](https://novel-ebook.com) for a webapp with the same functionality made by [MakeYourLifeEasier](https://github.com/MakeYourLifeEasier).

## Getting Started

@@ -16,6 +16,7 @@ To run this script you'll need to have Python 3.x.x installed which you can find
- [Volare Novels](https://www.volarenovels.com)
- [Totally Translations](https://totallytranslations.com/)
- [Novelle Leggere](https://www.novelleleggere.com/)
- [Read Light Novel](https://www.readlightnovel.org/) *initial load time is slow due to the sheer number of novels* **also, poor-quality formatting**
- Automatically adds some metadata like title and cover
- Concurrent download of chapters - *significantly* faster download of books than downloading them one by one (an order of magnitude or two faster)

@@ -39,7 +40,7 @@ path/where/you/installed/python.exe novel2ebook.py

### Prerequisites

As mentioned before this script was written for Python version 3.7.x. It may work with other versions too but none are tested.
As mentioned before, this script was written for Python version 3.x.x. It may work with other versions too, but none have been tested.
Additionally, the Python Imaging Library (Pillow), lxml, and BeautifulSoup4 are required.
To install all dependencies just use the console to navigate into the project folder and write

4 changes: 2 additions & 2 deletions novel2ebook.py
@@ -184,15 +184,15 @@ def onDownloadButtonClick(self):
self.TKW.guiElements["ProgressBar"].config(mode="determinate", maximum = endChapter-startChapter+1, value = 0)
self.newThread = threading.Thread(target=NovelDownloader.generateBookFromToMulti, args=(self.selectedParser,\
novel, startChapter, endChapter), kwargs={"callback":self.updateProgresstrack,\
"poolSize":self.poolSize, "idnum":self.progressTrackID})
"poolSize":self.poolSize, "idnum":self.progressTrackID, "bsParser":self.selectedParser.bsParser})
self.newThread.start()
#NovelDownloader.generateBookFromToMulti(self.selectedParser, novel, startChapter, endChapter, callback = self.updateProgresstrack)
else:
chapterList = self.selectedParser.getNovelBookChapterLinks(novel, self.TKW.guiElements["BookCombobox"][0].get())
self.TKW.guiElements["ProgressBar"].config(mode="determinate", maximum = len(chapterList), value = 0)
self.newThread = threading.Thread(target=NovelDownloader.generateBookMulti, args=(self.selectedParser,\
novel, self.TKW.guiElements["BookCombobox"][0].get()), kwargs={"callback":self.updateProgresstrack,\
"poolSize":self.poolSize, "idnum":self.progressTrackID})
"poolSize":self.poolSize, "idnum":self.progressTrackID, "bsParser":self.selectedParser.bsParser})
self.newThread.start()
#NovelDownloader.generateBook(self.selectedParser, novel, self.TKW.guiElements["BookCombobox"][0].get(), callback = self.updateProgresstrack)

