From 0853154f947b693f5129516ca742aea55ccd4372 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Thu, 11 Jul 2024 15:34:09 +0100 Subject: [PATCH 1/2] fix name in notice file --- NOTICE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NOTICE b/NOTICE index 8140c4b..8d21fbe 100644 --- a/NOTICE +++ b/NOTICE @@ -1,4 +1,4 @@ -WordPress Site Extractor +WPextract Copyright 2022-2024 The University of Sheffield Portions of this code are derived from WPJsonScraper, which is available under the MIT license. For details, see src/wpextract/download/LICENSE. \ No newline at end of file From 32cf61afe54d8c2a56ac50b7dedfab9560c7c9b1 Mon Sep 17 00:00:00 2001 From: Freddy Heppell Date: Thu, 11 Jul 2024 15:44:38 +0100 Subject: [PATCH 2/2] replace prints with logs --- pyproject.toml | 3 ++- src/wpextract/cli/_download.py | 1 - src/wpextract/download/requestsession.py | 1 - src/wpextract/download/wpapi.py | 3 ++- src/wpextract/downloader.py | 10 +++++----- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 617652f..78e8e12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,8 @@ select = [ "PT", "B", "UP", - "RUF" + "RUF", + "T20" ] ignore = [ diff --git a/src/wpextract/cli/_download.py b/src/wpextract/cli/_download.py index f4ff9ee..160a6c2 100644 --- a/src/wpextract/cli/_download.py +++ b/src/wpextract/cli/_download.py @@ -142,7 +142,6 @@ def download( OUT_JSON is the directory to output the downloaded JSON to. It must be an existing empty directory or a non-existent directory which will be created. """ setup_logging(verbose, log) - print(verbose) types_to_dl = set(dl_types) - set(skip_types) diff --git a/src/wpextract/download/requestsession.py b/src/wpextract/download/requestsession.py index 7039eba..1d86068 100644 --- a/src/wpextract/download/requestsession.py +++ b/src/wpextract/download/requestsession.py @@ -278,7 +278,6 @@ def do_request(self, method, url, data=None, stream=False): logging.error(f"Connection reset by {url}") raise ConnectionReset from e else: - print(e) raise e except requests.Timeout as e: logging.error(f"Request timed out fetching {url}") diff --git a/src/wpextract/download/wpapi.py b/src/wpextract/download/wpapi.py index d6d0350..ed0782a 100644 --- a/src/wpextract/download/wpapi.py +++ b/src/wpextract/download/wpapi.py @@ -1,4 +1,5 @@ import copy +import logging import math from json.decoder import JSONDecodeError from urllib.parse import urlencode @@ -156,7 +157,7 @@ def crawl_pages( ) and "X-WP-Total" in req.headers: total_entries = int(req.headers["X-WP-Total"]) total_pages = int(req.headers["X-WP-TotalPages"]) - print("Total number of entries: %d" % total_entries) + logging.info("Total number of entries: %d" % total_entries) if start is not None and total_entries < start: start = total_entries - 1 except HTTPErrorInvalidPage: diff --git a/src/wpextract/downloader.py b/src/wpextract/downloader.py index f39eb0e..ad2f5d3 100644 --- a/src/wpextract/downloader.py +++ b/src/wpextract/downloader.py @@ -68,16 +68,16 @@ def download_media_files(self, session: RequestSession, dest: Path): session: the request session to use dest: destination directory for media """ - print("Pulling media URLs") + logging.info("Pulling media URLs") media, slugs = self.scanner.get_media_urls("all", cache=True) if len(media) == 0: logging.warning("No media found corresponding to the criteria") return - print(f"{len(media)} media URLs found") + logging.info(f"{len(media)} media URLs found") number_dl = Exporter.download_media(session, media, dest) - print(f"Downloaded {number_dl} media files") + logging.info(f"Downloaded {number_dl} media files") def _get_fetch_or_list_type(self, obj_type, plural=False): """Returns a dict containing all necessary metadata about the obj_type to list and fetch data. @@ -117,7 +117,7 @@ def _get_fetch_or_list_type(self, obj_type, plural=False): def _list_obj(self, obj_type, start=None, limit=None, cache=True): prop = self._get_fetch_or_list_type(obj_type, plural=True) - print(prop["obj_name"] + " details") + logging.info(f"Downloading {prop['obj_name']}") try: kwargs = {} @@ -138,7 +138,7 @@ def _list_obj(self, obj_type, start=None, limit=None, cache=True): logging.error("The API does not support WP V2") except OSError as e: logging.error(f"Could not open {e.filename} for writing") - print() + logging.info(f"Completed downloading {prop['obj_name']}") @staticmethod def export_decorator(