From bf83ee2672478a6afd660e77cbb3aa9956fbba20 Mon Sep 17 00:00:00 2001
From: Freddy Heppell
Date: Wed, 10 Jul 2024 14:27:32 +0100
Subject: [PATCH] fix dl input directory path

---
 src/wpextract/cli/_dl.py       |  4 ++--
 src/wpextract/extractors/io.py | 15 ++++++++++++---
 src/wpextract/util/args.py     |  2 +-
 tests/extractors/test_io.py    |  6 ++++++
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/wpextract/cli/_dl.py b/src/wpextract/cli/_dl.py
index c2b34fe..f607a29 100644
--- a/src/wpextract/cli/_dl.py
+++ b/src/wpextract/cli/_dl.py
@@ -3,7 +3,7 @@
 from wpextract.cli._shared import _register_shared
 from wpextract.dl.downloader import WPDownloader
 from wpextract.dl.requestsession import RequestSession
-from wpextract.util.args import directory
+from wpextract.util.args import empty_directory
 
 dl_types = ["categories", "media", "pages", "posts", "tags", "users"]
 
@@ -20,7 +20,7 @@ def register_dl_parser(subparsers):
     )
     parser_dl.add_argument(
         "out_json",
-        type=directory,
+        type=empty_directory,
         help="the path of the output JSON file",
     )
     parser_dl.add_argument(
diff --git a/src/wpextract/extractors/io.py b/src/wpextract/extractors/io.py
index 1cd57b0..c48c1ea 100644
--- a/src/wpextract/extractors/io.py
+++ b/src/wpextract/extractors/io.py
@@ -1,5 +1,6 @@
 import dataclasses
 import json
+import logging
 from pathlib import Path
 from typing import Any, List, Optional
 
@@ -10,15 +11,18 @@ from pandas import Timestamp as PdTimestamp
 
 
 
-def load_from_path(path: Path) -> dict:
+def load_from_path(path: Path) -> Optional[dict]:
     """Loads and parses a JSON file.
 
     Args:
         path: The path to load
 
     Returns:
-        The decoded JSON object.
+        The decoded JSON object. None if the file does not exist.
     """
+    if not path.is_file():
+        return None
+
     with open(path, "r") as f:
         return json.load(f)
 
@@ -34,11 +38,16 @@ def load_df(path: Path, index_col: str = "id") -> Optional[pd.DataFrame]:
         index_col: The key from the JSON to use as the index
 
     Returns:
-        A dataframe with the flattened JSON.
+        A dataframe with the flattened JSON. None if the file does not exist or is an empty array.
     """
     data_raw = load_from_path(path)
 
+    if data_raw is None:
+        logging.info(f"File {path} does not exist, skipping this data type.")
+        return None
+
     if len(data_raw) == 0:
+        logging.info(f"File {path} has no data, skipping this data type.")
         return None
 
     return pd.json_normalize(data_raw).set_index(index_col)
diff --git a/src/wpextract/util/args.py b/src/wpextract/util/args.py
index ae748d2..7b7c5c3 100644
--- a/src/wpextract/util/args.py
+++ b/src/wpextract/util/args.py
@@ -41,7 +41,7 @@ def empty_directory(arg: str) -> Path:
     path = Path(arg)
 
     if path.exists() and not path.is_dir():
-        raise ArgumentTypeError("is not a directory, must be an empty directory")
+        raise ArgumentTypeError("exists but is not a directory")
 
     try:
         path.mkdir(exist_ok=True)
diff --git a/tests/extractors/test_io.py b/tests/extractors/test_io.py
index 1ac34ac..68d7e6a 100644
--- a/tests/extractors/test_io.py
+++ b/tests/extractors/test_io.py
@@ -15,6 +15,12 @@ def test_load_from_path(datadir):
     assert loaded[0]["entry"] == "one"
 
 
+def test_load_from_path_doesnt_exist(datadir):
+    loaded = load_from_path(datadir / "notreal.json")
+
+    assert loaded is None
+
+
 def test_load_df(datadir):
     df = load_df(datadir / "example.json")
 