From c861de82512899d8a7333c5fbbaede4713d1bc55 Mon Sep 17 00:00:00 2001 From: Gabriel Simmons Date: Sun, 23 Apr 2023 17:53:44 -0700 Subject: [PATCH 1/5] copy from Zotero storage --- paperqa/contrib/zotero.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/paperqa/contrib/zotero.py b/paperqa/contrib/zotero.py index 9ea9ab78..cc4cf94f 100644 --- a/paperqa/contrib/zotero.py +++ b/paperqa/contrib/zotero.py @@ -1,5 +1,6 @@ # This file gets PDF files from the user's Zotero library import os +import shutil from typing import Union, Optional from pathlib import Path import logging @@ -44,8 +45,22 @@ def __init__( library_id: Optional[str] = None, api_key: Optional[str] = None, storage: Optional[StrPath] = None, + zotero_storage: Optional[Union[StrPath,bool]] = "~/Zotero/storage/", **kwargs, ): + """Initialize the ZoteroDB object. + + Parameters + ---------- + storage: str, optional + The path to the directory where PDFs will be stored. Defaults to + `~/.paperqa/zotero`. + zotero_storage: str, optional + The path to storage directory where Zotero stores PDFs. Defaults to + `~/Zotero/storage/`. Set this to use previously-downloaded PDFs. Set to `False` to + disable this feature. 
+ """ + self.logger = logging.getLogger("ZoteroDB") if library_id is None: @@ -76,6 +91,11 @@ def __init__( if storage is None: storage = CACHE_PATH.parent / "zotero" + + if zotero_storage: + self.zotero_storage = Path(zotero_storage).expanduser() + else: + self.zotero_storage = None self.logger.info(f"Using cache location: {storage}") self.storage = storage @@ -107,6 +127,22 @@ def get_pdf(self, item: dict) -> Union[Path, None]: pdf_path = self.storage / (pdf_key + ".pdf") if not pdf_path.exists(): + if self.zotero_storage: + self.logger.info(f"| Looking for existing PDF for: {self._get_citation_key(item)}") + try: + zotero_doc_folder = self.zotero_storage / pdf_key + + if zotero_doc_folder.exists(): + pdf_files = list(zotero_doc_folder.glob("*.pdf")) + if len(pdf_files) == 1: + self.logger.info(f"| Copying existing PDF for {self._get_citation_key(item)} from Zotero storage.") + zotero_pdf_path = zotero_doc_folder / pdf_files[0] + shutil.copy(zotero_pdf_path, pdf_path) + return pdf_path + + except Exception as e: + self.logger.warning(f"Could not copy file from Zotero storage, redownloading file. 
Error: {e}") + pdf_path.parent.mkdir(parents=True, exist_ok=True) self.logger.info(f"| Downloading PDF for: {_get_citation_key(item)}") self.dump(pdf_key, pdf_path) From 261acbf0295a4125d0cf3874fcd688ea22858354 Mon Sep 17 00:00:00 2001 From: Gabriel Simmons Date: Sun, 23 Apr 2023 18:05:21 -0700 Subject: [PATCH 2/5] minor fixes --- paperqa/contrib/zotero.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paperqa/contrib/zotero.py b/paperqa/contrib/zotero.py index cc4cf94f..28473d44 100644 --- a/paperqa/contrib/zotero.py +++ b/paperqa/contrib/zotero.py @@ -98,7 +98,7 @@ def __init__( self.zotero_storage = None self.logger.info(f"Using cache location: {storage}") - self.storage = storage + self.storage = Path(storage) super().__init__( library_type=library_type, library_id=library_id, api_key=api_key, **kwargs @@ -128,14 +128,14 @@ def get_pdf(self, item: dict) -> Union[Path, None]: if not pdf_path.exists(): if self.zotero_storage: - self.logger.info(f"| Looking for existing PDF for: {self._get_citation_key(item)}") + self.logger.info(f"| Looking for existing PDF for: {_get_citation_key(item)}") try: zotero_doc_folder = self.zotero_storage / pdf_key if zotero_doc_folder.exists(): pdf_files = list(zotero_doc_folder.glob("*.pdf")) if len(pdf_files) == 1: - self.logger.info(f"| Copying existing PDF for {self._get_citation_key(item)} from Zotero storage.") + self.logger.info(f"| Copying existing PDF for {_get_citation_key(item)} from Zotero storage.") zotero_pdf_path = zotero_doc_folder / pdf_files[0] shutil.copy(zotero_pdf_path, pdf_path) return pdf_path From 8dc185f3884ade4123ded12367f948e78ae0ec84 Mon Sep 17 00:00:00 2001 From: Gabriel Simmons Date: Sun, 23 Apr 2023 18:36:56 -0700 Subject: [PATCH 3/5] change zotero_storage default arg, check if zotero_storage exists --- paperqa/contrib/zotero.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/paperqa/contrib/zotero.py b/paperqa/contrib/zotero.py index 
28473d44..9f218e41 100644 --- a/paperqa/contrib/zotero.py +++ b/paperqa/contrib/zotero.py @@ -45,7 +45,7 @@ def __init__( library_id: Optional[str] = None, api_key: Optional[str] = None, storage: Optional[StrPath] = None, - zotero_storage: Optional[Union[StrPath,bool]] = "~/Zotero/storage/", + zotero_storage: Optional[Union[StrPath,bool]] = None, **kwargs, ): """Initialize the ZoteroDB object. @@ -92,11 +92,18 @@ def __init__( if storage is None: storage = CACHE_PATH.parent / "zotero" - if zotero_storage: - self.zotero_storage = Path(zotero_storage).expanduser() - else: - self.zotero_storage = None + if isinstance(zotero_storage, StrPath): + self.zotero_storage = Path(zotero_storage).expanduser() + if not self.zotero_storage.exists(): + raise FileNotFoundError(f"Zotero storage directory {zotero_storage} does not exist.") + + elif zotero_storage: + self.zotero_storage = Path.home() / "Zotero" / "storage" + if not self.zotero_storage.exists(): + self.logger.warning(f"Zotero storage directory {zotero_storage} does not exist. 
Disabling copy from Zotero storage.") + self.zotero_storage = False + self.logger.info(f"Using cache location: {storage}") self.storage = Path(storage) From e1f1e2a43c4a79976521d09252d71a3f554410bd Mon Sep 17 00:00:00 2001 From: Gabriel Simmons Date: Sun, 23 Apr 2023 18:37:53 -0700 Subject: [PATCH 4/5] Add logging if more than one PDF is found in Zotero storage Co-authored-by: Miles Cranmer --- paperqa/contrib/zotero.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paperqa/contrib/zotero.py b/paperqa/contrib/zotero.py index 9f218e41..c83dfbd1 100644 --- a/paperqa/contrib/zotero.py +++ b/paperqa/contrib/zotero.py @@ -146,6 +146,8 @@ def get_pdf(self, item: dict) -> Union[Path, None]: zotero_pdf_path = zotero_doc_folder / pdf_files[0] shutil.copy(zotero_pdf_path, pdf_path) return pdf_path + else: + self.logger.warning(f"| Found more than one PDF for {_get_citation_key(item)}, so skipping.") except Exception as e: self.logger.warning(f"Could not copy file from Zotero storage, redownloading file. Error: {e}") From 754fdb62ad5ec748554cb871541162cd16867b68 Mon Sep 17 00:00:00 2001 From: Gabriel Simmons Date: Sun, 23 Apr 2023 18:39:03 -0700 Subject: [PATCH 5/5] change zotero_storage default arg --- paperqa/contrib/zotero.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paperqa/contrib/zotero.py b/paperqa/contrib/zotero.py index c83dfbd1..73c9edda 100644 --- a/paperqa/contrib/zotero.py +++ b/paperqa/contrib/zotero.py @@ -45,7 +45,7 @@ def __init__( library_id: Optional[str] = None, api_key: Optional[str] = None, storage: Optional[StrPath] = None, - zotero_storage: Optional[Union[StrPath,bool]] = None, + zotero_storage: Union[StrPath,bool] = True, **kwargs, ): """Initialize the ZoteroDB object.