diff --git a/.pylintrc b/.pylintrc index e050c18..3276277 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,6 +1,6 @@ [FORMAT] max-line-length=88 -good-names=i,x1,x2,y1,y2 +good-names=i,x1,x2,y1,y2,id [MESSAGES CONTROL] disable=missing-docstring,too-many-ancestors,too-few-public-methods,no-else-return,no-member,attribute-defined-outside-init,similarities,import-outside-toplevel,cyclic-import,no-member,no-else-raise,too-many-instance-attributes,too-many-arguments,ungrouped-imports,useless-object-inheritance,no-else-continue diff --git a/documentcloud/addon.py b/documentcloud/addon.py index 4798ea5..0113437 100644 --- a/documentcloud/addon.py +++ b/documentcloud/addon.py @@ -24,7 +24,7 @@ class BaseAddOn: def __init__(self): args = self._parse_arguments() - client = self._create_client(args) + self._create_client(args) # a unique identifier for this run self.id = args.pop("id", None) @@ -65,7 +65,7 @@ def _create_client(self, args): self.client.refresh_token = args["refresh_token"] if args["token"] is not None: self.client.session.headers.update( - {"Authorization": "Bearer {}".format(args["token"])} + {"Authorization": f"Bearer {args['token']}"} ) # custom user agent for AddOns @@ -119,7 +119,7 @@ def _parse_arguments(self): # validate parameter data try: - with open("config.yaml") as config: + with open("config.yaml", encoding="utf-8") as config: schema = yaml.safe_load(config) args["data"] = fastjsonschema.validate(schema, args["data"]) # add title in case the add-on wants to reference its own title @@ -207,6 +207,8 @@ def get_document_count(self): documents = self.client.documents.search(self.query) return documents.count + return 0 + def get_documents(self): """Get documents from either selected or queried documents""" if self.documents: diff --git a/documentcloud/base.py b/documentcloud/base.py index ed6eae3..b2cb256 100644 --- a/documentcloud/base.py +++ b/documentcloud/base.py @@ -98,9 +98,7 @@ def get(self, id_, expand=None): params = {"expand": ",".join(expand)} else: params = {} - response = self.client.get( - "{}/{}/".format(self.api_path, get_id(id_)), params=params - ) + response = self.client.get(f"{self.api_path}/{get_id(id_)}/", params=params) # pylint: disable=not-callable return self.resource(self.client, response.json()) @@ -120,7 +118,7 @@ class ChildAPIClient(BaseAPIClient): """Base client for sub resources""" def __init__(self, client, parent): - super(ChildAPIClient, self).__init__(client) + super().__init__(client) self.parent = parent def list(self, **params): @@ -169,7 +167,7 @@ def delete(self): class APISet(list): def __init__(self, iterable, resource): - super(APISet, self).__init__(iterable) + super().__init__(iterable) self.resource = resource if not all(isinstance(obj, self.resource) for obj in self): raise TypeError( @@ -191,7 +189,7 @@ def append(self, obj): raise DuplicateObjectError( f"Object with ID {obj.id} appears in the list more than once" ) - super(APISet, self).append(copy(obj)) + super().append(copy(obj)) def add(self, obj): if not isinstance(obj, self.resource): @@ -200,7 +198,7 @@ def add(self, obj): ) # skip duplicates silently if obj.id not in [i.id for i in self]: - super(APISet, self).append(copy(obj)) + super().append(copy(obj)) def extend(self, list_): if not all(isinstance(obj, self.resource) for obj in list_): @@ -213,4 +211,4 @@ def extend(self, list_): raise DuplicateObjectError( f"Object with ID {id_} appears in the list more than once" ) - super(APISet, self).extend(copy(obj) for obj in list_) + super().extend(copy(obj) for obj in list_) diff --git a/documentcloud/client.py b/documentcloud/client.py index 0c8f8d4..104da37 100644 --- a/documentcloud/client.py +++ b/documentcloud/client.py @@ -169,6 +169,6 @@ def raise_for_status(self, response): response.raise_for_status() except requests.exceptions.RequestException as exc: if exc.response.status_code == 404: - raise DoesNotExistError(response=exc.response) + raise DoesNotExistError(response=exc.response) from exc else: - raise APIError(response=exc.response) + raise APIError(response=exc.response) from exc diff --git a/documentcloud/documents.py b/documentcloud/documents.py index 9202e74..d3d3d42 100644 --- a/documentcloud/documents.py +++ b/documentcloud/documents.py @@ -3,11 +3,11 @@ """ # Standard Library +import datetime import logging import os import re import warnings -import datetime from functools import partial # Third Party @@ -62,7 +62,7 @@ def __init__(self, client, dict_): dict_[f"_{name}"] = None dict_[f"{name}_id"] = value - super(Document, self).__init__(client, dict_) + super().__init__(client, dict_) self.sections = SectionClient(client, self) self.annotations = AnnotationClient(client, self) @@ -187,7 +187,10 @@ def get_page_text_url(self, page=1): return f"{self.asset_url}documents/{self.id}/pages/{self.slug}-p{page}.txt" def get_page_position_json_url(self, page=1): - return f"{self.asset_url}documents/{self.id}/pages/{self.slug}-p{page}.position.json" + return ( + f"{self.asset_url}documents/{self.id}/pages/" + f"{self.slug}-p{page}.position.json" + ) def get_json_text_url(self): return f"{self.asset_url}documents/{self.id}/{self.slug}.txt.json" @@ -265,6 +268,16 @@ def list(self, **params): def upload(self, pdf, **kwargs): """Upload a document""" + + def check_size(size): + # DocumentCloud's size limit is set to 501MB to give people a little leeway + # for OS rounding + if size >= 501 * 1024 * 1024: + raise ValueError( + "The pdf you have submitted is over the DocumentCloud API's 500MB " + "file size limit. Split it into smaller pieces and try again." + ) + # if they pass in a URL, use the URL upload flow if is_url(pdf): return self._upload_url(pdf, **kwargs) @@ -275,19 +288,13 @@ def upload(self, pdf, **kwargs): size = os.fstat(pdf.fileno()).st_size except (AttributeError, OSError): # pragma: no cover size = 0 + check_size(size) + return self._upload_file(pdf, **kwargs) else: size = os.path.getsize(pdf) - pdf = open(pdf, "rb") - - # DocumentCloud's size limit is set to 501MB to give people a little leeway - # for OS rounding - if size >= 501 * 1024 * 1024: - raise ValueError( - "The pdf you have submitted is over the DocumentCloud API's 500MB " - "file size limit. Split it into smaller pieces and try again." - ) - - return self._upload_file(pdf, **kwargs) + check_size(size) + with open(pdf, "rb") as pdf_file: + return self._upload_file(pdf_file, **kwargs) def _format_upload_parameters(self, name, **kwargs): """Prepare upload parameters from kwargs""" @@ -371,11 +378,13 @@ def _collect_files(self, path, extensions): def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs): """Upload files with specified extensions in a directory""" + # pylint: disable=too-many-locals, too-many-branches # Do not set the same title for all documents kwargs.pop("title", None) - # If extensions are specified as None, it will check for all supported filetypes. + # If extensions are specified as None, it will check for all supported + # filetypes. if extensions is None: extensions = SUPPORTED_EXTENSIONS @@ -444,9 +453,8 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg for url, file_path in zip(presigned_urls, file_paths): logger.info(f"Uploading {file_path} to S3...") try: - response = requests_retry_session().put( - url, data=open(file_path, "rb").read() - ) + with open(file_path, "rb") as file: + response = requests_retry_session().put(url, data=file.read()) self.client.raise_for_status(response) except (APIError, RequestException) as exc: if handle_errors: diff --git a/documentcloud/exceptions.py b/documentcloud/exceptions.py index e1d2107..9757be2 100644 --- a/documentcloud/exceptions.py +++ b/documentcloud/exceptions.py @@ -16,7 +16,7 @@ def __init__(self, *args, **kwargs): else: self.error = None self.status_code = None - super(DocumentCloudError, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) class DuplicateObjectError(DocumentCloudError): diff --git a/documentcloud/projects.py b/documentcloud/projects.py index 09790aa..20fd5a2 100644 --- a/documentcloud/projects.py +++ b/documentcloud/projects.py @@ -14,7 +14,7 @@ class Project(BaseAPIObject): def __init__(self, *args, **kwargs): per_page = kwargs.pop("per_page", PER_PAGE_MAX) - super(Project, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self._document_list = None self._per_page = per_page @@ -23,7 +23,7 @@ def __str__(self): def save(self): """Add the documents to the project as well""" - super(Project, self).save() + super().save() if self._document_list: self.clear_documents() self.add_documents(self._document_list) @@ -100,7 +100,7 @@ def all(self, **params): return self.list(user=self.client.user_id, **params) def get(self, id=None, title=None): - # pylint:disable=redefined-builtin, arguments-differ + # pylint:disable=redefined-builtin, arguments-renamed # pylint disables are necessary for backward compatibility if id is not None and title is not None: raise ValueError( @@ -115,7 +115,7 @@ def get(self, id=None, title=None): return self.get_by_title(title) def get_by_id(self, id_): - return super(ProjectClient, self).get(id_) + return super().get(id_) def get_by_title(self, title): response = self.client.get( diff --git a/documentcloud/toolbox.py b/documentcloud/toolbox.py index e2c264f..d2e87a0 100644 --- a/documentcloud/toolbox.py +++ b/documentcloud/toolbox.py @@ -2,12 +2,14 @@ A few toys the API will use. """ +# Standard Library +from itertools import zip_longest +from urllib.parse import urlparse + # Third Party import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from urllib.parse import urlparse -from itertools import zip_longest def requests_retry_session(