From 063c57a6e8e455f37c7a638591553bf17984d5b7 Mon Sep 17 00:00:00 2001 From: cvanelteren Date: Thu, 24 Oct 2024 15:24:03 +0200 Subject: [PATCH 1/7] added epub support --- paper2remarkable/providers/__init__.py | 3 +++ paper2remarkable/ui.py | 24 +++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index 7adc63a..04e91f2 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from paper2remarkable.providers.epub import EPUBProvider from .acl import ACL from .acm import ACM from .arxiv import Arxiv @@ -19,6 +20,7 @@ from .pubmed import PubMed from .semantic_scholar import SemanticScholar from .springer import Springer +from .epub import EPUBProvider # # The following providers are no longer functional due to Cloudflare blocking # # automated access, and have therefore been removed from the list of providers @@ -49,4 +51,5 @@ LocalFile, PdfUrl, HTML, + EPUBProvider ] diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 065ac9f..34e52a3 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -19,7 +19,7 @@ from . import __version__ from .exceptions import InvalidURLError from .exceptions import UnidentifiedSourceError -from .providers import LocalFile +from .providers import LocalFile, EPUBProvider from .providers import providers from .utils import follow_redirects from .utils import is_url @@ -27,8 +27,13 @@ def build_argument_parser(): parser = argparse.ArgumentParser( - description="Paper2reMarkable version %s" % __version__ + description="Paper2reMarkable version %s - Upload PDFs and EPUBs to reMarkable" % __version__ ) + parser.add_argument( + "input", + help="One or more URLs to a paper or paths to local PDF/EPUB files", + nargs="?", + ) parser.add_argument( "-b", "--blank", @@ -184,17 +189,22 @@ def choose_provider(cli_input): Raised when the input *is* a valid url, but no provider can handle it. """ + provider = cookiejar = None - if LocalFile.validate(cli_input): - # input is a local file + + # Check if it's a local file first + if os.path.exists(cli_input): new_input = cli_input - provider = LocalFile + # If it's an epub, use EPUBProvider + if cli_input.lower().endswith('.epub'): + provider = EPUBProvider + # Otherwise use LocalFile for PDFs + else: + provider = LocalFile elif is_url(cli_input): - # input is a url new_input, cookiejar = follow_redirects(cli_input) provider = next((p for p in providers if p.validate(new_input)), None) else: - # not a proper URL or non-existent file raise UnidentifiedSourceError if provider is None: From ca21571e4cfc0e2f7f09ff107bb8bae3b4e5dcec Mon Sep 17 00:00:00 2001 From: cvanelteren Date: Thu, 24 Oct 2024 15:26:26 +0200 Subject: [PATCH 2/7] forgot to add epub file --- paper2remarkable/providers/epub.py | 65 ++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 paper2remarkable/providers/epub.py diff --git a/paper2remarkable/providers/epub.py b/paper2remarkable/providers/epub.py new file mode 100644 index 0000000..86294db --- /dev/null +++ b/paper2remarkable/providers/epub.py @@ -0,0 +1,65 @@ +from ._base import Provider +from ..utils import chdir, upload_to_remarkable +import os, tempfile, shutil + +class EPUBProvider(Provider): + """Provider for direct EPUB uploads""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Override operations since we don't need PDF processing + self.operations = [] # No operations needed for direct EPUB upload + + @staticmethod + def validate(src): + """Validate if source is an EPUB file""" + # Convert to absolute path before validation + abs_path = os.path.abspath(os.path.expanduser(src)) + return abs_path.lower().endswith('.epub') and os.path.exists(abs_path) + + + def get_abs_pdf_urls(self, src): + """For EPUB files, just return the local path as absolute path""" + abs_path = os.path.abspath(os.path.expanduser(src)) + return abs_path, abs_path + + + def run(self, src, filename=None): + """Override run method to handle EPUB files directly""" + # Convert to absolute path + src = os.path.abspath(os.path.expanduser(src)) + + if not self.validate(src): + raise ValueError("Source must be a valid EPUB file") + + # Generate filename if not provided + clean_filename = filename or os.path.basename(src) + if not clean_filename.endswith('.epub'): + clean_filename += '.epub' + + self.initial_dir = os.getcwd() + with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir: + with chdir(working_dir): + # Simply copy the EPUB file + shutil.copy(src, clean_filename) + + if self.debug: + print("Paused in debug mode in dir: %s" % working_dir) + print("Press enter to exit.") + return input() + + if self.upload: + return upload_to_remarkable( + clean_filename, + remarkable_dir=self.remarkable_dir, + rmapi_path=self.rmapi_path, + ) + + # If not uploading, copy to target directory + target_path = os.path.join(self.initial_dir, clean_filename) + while os.path.exists(target_path): + base = os.path.splitext(target_path)[0] + target_path = base + "_.epub" + shutil.move(clean_filename, target_path) + + return target_path From d5c2e173d919aec6c4a23de36e62033a161d1f62 Mon Sep 17 00:00:00 2001 From: cvanelteren Date: Thu, 14 Nov 2024 23:32:06 +0100 Subject: [PATCH 3/7] moved epub inside localfilehandler --- paper2remarkable/providers/__init__.py | 3 -- paper2remarkable/providers/_base.py | 56 ++++++++++++++++------ paper2remarkable/providers/epub.py | 65 -------------------------- paper2remarkable/ui.py | 24 +++------- 4 files changed, 49 insertions(+), 99 deletions(-) delete mode 100644 paper2remarkable/providers/epub.py diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index 04e91f2..7adc63a 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -from paper2remarkable.providers.epub import EPUBProvider from .acl import ACL from .acm import ACM from .arxiv import Arxiv @@ -20,7 +19,6 @@ from .pubmed import PubMed from .semantic_scholar import SemanticScholar from .springer import Springer -from .epub import EPUBProvider # # The following providers are no longer functional due to Cloudflare blocking # # automated access, and have therefore been removed from the list of providers @@ -51,5 +49,4 @@ LocalFile, PdfUrl, HTML, - EPUBProvider ] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 04a4925..df4cad5 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -34,6 +34,7 @@ class Provider(metaclass=abc.ABCMeta): """ABC for providers of pdf sources""" + SUPPORTED_FORMATS = ["pdf", "ps", "epub"] def __init__( self, verbose=False, @@ -77,20 +78,41 @@ def __init__( logger.disable() # Define the operations to run on the pdf. Providers can add others. - self.operations = [("rewrite", self.rewrite_pdf)] - if crop == "center": - self.operations.append(("center", self.center_pdf)) - elif crop == "right": - self.operations.append(("right", self.right_pdf)) - elif crop == "left": - self.operations.append(("crop", self.crop_pdf)) + self.operations = { + format: [] for format in self.SUPPORTED_FORMATS + } + self._configure_operations(crop, blank) + logger.info("Starting %s provider" % type(self).__name__) - if blank: - self.operations.append(("blank", blank_pdf)) - self.operations.append(("shrink", self.shrink_pdf)) + def _configure_operations(self, crop, blank): + """Configure operations for PDF and PS formats""" + # Formats that need PDF processing + pdf_formats = ['pdf', 'ps'] + def add_operation(formats, operation_name, operation_func): + for fmt in formats: + self.operations[fmt].append((operation_name, operation_func)) - logger.info("Starting %s provider" % type(self).__name__) + # Base operations + add_operation(pdf_formats, "rewrite", self.rewrite_pdf) + + # Crop operations mapping + crop_operations = { + 'center': ('center', self.center_pdf), + 'right': ('right', self.right_pdf), + 'left': ('crop', self.crop_pdf) + } + + # Add crop operation if specified + if crop in crop_operations: + add_operation(pdf_formats, *crop_operations[crop]) + + # Add blank operation if specified + if blank: + add_operation(pdf_formats, "blank", blank_pdf) + + # PDF-specific shrink operation + add_operation(['pdf'], "shrink", self.shrink_pdf) @staticmethod @abc.abstractmethod @@ -210,17 +232,23 @@ def run(self, src, filename=None): # generate nice filename if needed clean_filename = filename or self.informer.get_filename(abs_url) - tmp_filename = "paper.pdf" + extension = clean_filename.split(".")[-1] + tmp_filename = f"paper.{extension}" + + if extension not in self.SUPPORTED_FORMATS: + raise ValueError(f"Unsupported file format {extension}. Must be one of {self.SUPPORTED_FORMATS}") + self.initial_dir = os.getcwd() with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir: with chdir(working_dir): self.retrieve_pdf(pdf_url, tmp_filename) - assert_file_is_pdf(tmp_filename) + if extension in "pdf ps".split(): + assert_file_is_pdf(tmp_filename) intermediate_fname = tmp_filename - for opname, op in self.operations: + for opname, op in self.operations[extension]: intermediate_fname = op(intermediate_fname) shutil.copy(intermediate_fname, clean_filename) diff --git a/paper2remarkable/providers/epub.py b/paper2remarkable/providers/epub.py deleted file mode 100644 index 86294db..0000000 --- a/paper2remarkable/providers/epub.py +++ /dev/null @@ -1,65 +0,0 @@ -from ._base import Provider -from ..utils import chdir, upload_to_remarkable -import os, tempfile, shutil - -class EPUBProvider(Provider): - """Provider for direct EPUB uploads""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Override operations since we don't need PDF processing - self.operations = [] # No operations needed for direct EPUB upload - - @staticmethod - def validate(src): - """Validate if source is an EPUB file""" - # Convert to absolute path before validation - abs_path = os.path.abspath(os.path.expanduser(src)) - return abs_path.lower().endswith('.epub') and os.path.exists(abs_path) - - - def get_abs_pdf_urls(self, src): - """For EPUB files, just return the local path as absolute path""" - abs_path = os.path.abspath(os.path.expanduser(src)) - return abs_path, abs_path - - - def run(self, src, filename=None): - """Override run method to handle EPUB files directly""" - # Convert to absolute path - src = os.path.abspath(os.path.expanduser(src)) - - if not self.validate(src): - raise ValueError("Source must be a valid EPUB file") - - # Generate filename if not provided - clean_filename = filename or os.path.basename(src) - if not clean_filename.endswith('.epub'): - clean_filename += '.epub' - - self.initial_dir = os.getcwd() - with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir: - with chdir(working_dir): - # Simply copy the EPUB file - shutil.copy(src, clean_filename) - - if self.debug: - print("Paused in debug mode in dir: %s" % working_dir) - print("Press enter to exit.") - return input() - - if self.upload: - return upload_to_remarkable( - clean_filename, - remarkable_dir=self.remarkable_dir, - rmapi_path=self.rmapi_path, - ) - - # If not uploading, copy to target directory - target_path = os.path.join(self.initial_dir, clean_filename) - while os.path.exists(target_path): - base = os.path.splitext(target_path)[0] - target_path = base + "_.epub" - shutil.move(clean_filename, target_path) - - return target_path diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 34e52a3..065ac9f 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -19,7 +19,7 @@ from . import __version__ from .exceptions import InvalidURLError from .exceptions import UnidentifiedSourceError -from .providers import LocalFile, EPUBProvider +from .providers import LocalFile from .providers import providers from .utils import follow_redirects from .utils import is_url @@ -27,13 +27,8 @@ def build_argument_parser(): parser = argparse.ArgumentParser( - description="Paper2reMarkable version %s - Upload PDFs and EPUBs to reMarkable" % __version__ + description="Paper2reMarkable version %s" % __version__ ) - parser.add_argument( - "input", - help="One or more URLs to a paper or paths to local PDF/EPUB files", - nargs="?", - ) parser.add_argument( "-b", "--blank", @@ -189,22 +184,17 @@ def choose_provider(cli_input): Raised when the input *is* a valid url, but no provider can handle it. """ - provider = cookiejar = None - - # Check if it's a local file first - if os.path.exists(cli_input): + if LocalFile.validate(cli_input): + # input is a local file new_input = cli_input - # If it's an epub, use EPUBProvider - if cli_input.lower().endswith('.epub'): - provider = EPUBProvider - # Otherwise use LocalFile for PDFs - else: - provider = LocalFile + provider = LocalFile elif is_url(cli_input): + # input is a url new_input, cookiejar = follow_redirects(cli_input) provider = next((p for p in providers if p.validate(new_input)), None) else: + # not a proper URL or non-existent file raise UnidentifiedSourceError if provider is None: From 98c7c8a15838b3729b489b73a2c89d9bafbf5362 Mon Sep 17 00:00:00 2001 From: cvanelteren Date: Thu, 14 Nov 2024 23:34:49 +0100 Subject: [PATCH 4/7] added epub support to LocalProvider --- paper2remarkable/providers/_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index df4cad5..e3d8c01 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -88,6 +88,7 @@ def __init__( def _configure_operations(self, crop, blank): """Configure operations for PDF and PS formats""" # Formats that need PDF processing + # No processing for epubs is assumed pdf_formats = ['pdf', 'ps'] def add_operation(formats, operation_name, operation_func): for fmt in formats: From 3332f86c137a3f7461d79da6131fc7050f90a567 Mon Sep 17 00:00:00 2001 From: cvanelteren Date: Thu, 14 Nov 2024 23:57:50 +0100 Subject: [PATCH 5/7] update unittest to work with operations --- paper2remarkable/providers/arxiv.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index eabcbba..fb42b39 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -40,7 +40,9 @@ def __init__(self, *args, **kwargs): self.informer = ArxivInformer() # register the dearxiv operation - self.operations.insert(0, ("dearxiv", self.dearxiv)) + for format in self.operations: + if format in "pdf ps".split(): + self.operations[format].insert(0, ("dearxiv", self.dearxiv)) def get_abs_pdf_urls(self, url): """Get the pdf and abs url from any given arXiv url""" From a952e46fb2c9662332528baf89d3284119fd9fb4 Mon Sep 17 00:00:00 2001 From: cvanelteren Date: Sun, 5 Jan 2025 13:21:34 +0100 Subject: [PATCH 6/7] added unittest and black formatting --- paper2remarkable/providers/_base.py | 22 +++++++++++----------- paper2remarkable/providers/pdf_url.py | 5 +++-- tests/test_providers.py | 7 +++++++ 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index e3d8c01..64fd514 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -35,6 +35,7 @@ class Provider(metaclass=abc.ABCMeta): """ABC for providers of pdf sources""" SUPPORTED_FORMATS = ["pdf", "ps", "epub"] + def __init__( self, verbose=False, @@ -78,18 +79,16 @@ def __init__( logger.disable() # Define the operations to run on the pdf. Providers can add others. - self.operations = { - format: [] for format in self.SUPPORTED_FORMATS - } + self.operations = {format: [] for format in self.SUPPORTED_FORMATS} self._configure_operations(crop, blank) logger.info("Starting %s provider" % type(self).__name__) - def _configure_operations(self, crop, blank): """Configure operations for PDF and PS formats""" # Formats that need PDF processing # No processing for epubs is assumed - pdf_formats = ['pdf', 'ps'] + pdf_formats = ["pdf", "ps"] + def add_operation(formats, operation_name, operation_func): for fmt in formats: self.operations[fmt].append((operation_name, operation_func)) @@ -99,9 +98,9 @@ def add_operation(formats, operation_name, operation_func): # Crop operations mapping crop_operations = { - 'center': ('center', self.center_pdf), - 'right': ('right', self.right_pdf), - 'left': ('crop', self.crop_pdf) + "center": ("center", self.center_pdf), + "right": ("right", self.right_pdf), + "left": ("crop", self.crop_pdf), } # Add crop operation if specified @@ -113,7 +112,7 @@ def add_operation(formats, operation_name, operation_func): add_operation(pdf_formats, "blank", blank_pdf) # PDF-specific shrink operation - add_operation(['pdf'], "shrink", self.shrink_pdf) + add_operation(["pdf"], "shrink", self.shrink_pdf) @staticmethod @abc.abstractmethod @@ -237,8 +236,9 @@ def run(self, src, filename=None): tmp_filename = f"paper.{extension}" if extension not in self.SUPPORTED_FORMATS: - raise ValueError(f"Unsupported file format {extension}. Must be one of {self.SUPPORTED_FORMATS}") - + raise ValueError( + f"Unsupported file format {extension}. Must be one of {self.SUPPORTED_FORMATS}" + ) self.initial_dir = os.getcwd() with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir: diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index 4a58f10..f319668 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -33,11 +33,12 @@ def get_filename(self, abs_url): ) filename = path_parts[-1] - if not filename.endswith(".pdf"): + ext = filename.split(".")[-1] + if ext not in [".pdf", "epub"]: raise FilenameMissingError( provider="PdfUrl", url=abs_url, - reason="URL path didn't end in .pdf", + reason="URL path didn't end in .pdf or .epub", ) logger.warning( "Using filename {filename} extracted from url. " diff --git a/tests/test_providers.py b/tests/test_providers.py index 6cb1996..0ae0838 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -193,6 +193,13 @@ def test_pdfurl_2(self): filename = prov.run(url) self.assertEqual("NoREC.pdf", os.path.basename(filename)) + def test_epub(self): + prov = PdfUrl(upload=False, verbose=VERBOSE) + url = "https://www.gutenberg.org/ebooks/2701.epub.images" + filename = prov.run(url) + exp = "pg2701-images.epub" + self.assertEqual(exp, os.path.basename(filename)) + def test_jmlr_1(self): prov = JMLR(upload=False, verbose=VERBOSE) url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" From 2f83eb24b796a934563bf0f7d09c991405d2257c Mon Sep 17 00:00:00 2001 From: cvanelteren Date: Mon, 6 Jan 2025 08:37:30 +0100 Subject: [PATCH 7/7] fixed typo --- paper2remarkable/providers/pdf_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index f319668..11d8e0d 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -34,7 +34,7 @@ def get_filename(self, abs_url): filename = path_parts[-1] ext = filename.split(".")[-1] - if ext not in [".pdf", "epub"]: + if ext not in ["pdf", "epub"]: raise FilenameMissingError( provider="PdfUrl", url=abs_url,