Skip to content

Commit

Permalink
fix dl input directory path
Browse files (browse the repository at this point in the history)
  • Loading branch information
freddyheppell committed Jul 10, 2024
1 parent 2139fd7 commit bf83ee2
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 6 deletions.
4 changes: 2 additions & 2 deletions src/wpextract/cli/_dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from wpextract.cli._shared import _register_shared
from wpextract.dl.downloader import WPDownloader
from wpextract.dl.requestsession import RequestSession
from wpextract.util.args import directory
from wpextract.util.args import empty_directory

dl_types = ["categories", "media", "pages", "posts", "tags", "users"]

Expand All @@ -20,7 +20,7 @@ def register_dl_parser(subparsers):
)
parser_dl.add_argument(
"out_json",
type=directory,
type=empty_directory,
help="the path of the output JSON file",
)
parser_dl.add_argument(
Expand Down
15 changes: 12 additions & 3 deletions src/wpextract/extractors/io.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import dataclasses
import json
import logging
from pathlib import Path
from typing import Any, List, Optional

Expand All @@ -10,15 +11,18 @@
from pandas import Timestamp as PdTimestamp


def load_from_path(path: Path) -> Optional[dict]:
    """Loads and parses a JSON file.

    Args:
        path: The path to load

    Returns:
        The decoded JSON value (the top-level value may be an array as well
        as an object). None if the path is not an existing regular file.
    """
    # Report a missing file as None instead of raising, so callers can
    # treat it as "nothing to load".
    if not path.is_file():
        return None

    # JSON text is UTF-8 by specification; do not depend on the locale's
    # default encoding, which differs between platforms.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

Expand All @@ -34,11 +38,16 @@ def load_df(path: Path, index_col: str = "id") -> Optional[pd.DataFrame]:
index_col: The key from the JSON to use as the index
Returns:
A dataframe with the flattened JSON.
A dataframe with the flattened JSON. None if the file does not exist or is an empty array.
"""
data_raw = load_from_path(path)

if data_raw is None:
logging.info(f"File {path} does not exist, skipping this data type.")
return None

if len(data_raw) == 0:
logging.info(f"File {path} has no data, skipping this data type.")
return None

return pd.json_normalize(data_raw).set_index(index_col)
Expand Down
2 changes: 1 addition & 1 deletion src/wpextract/util/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def empty_directory(arg: str) -> Path:
path = Path(arg)

if path.exists() and not path.is_dir():
raise ArgumentTypeError("is not a directory, must be an empty directory")
raise ArgumentTypeError("exists but is not a directory")

try:
path.mkdir(exist_ok=True)
Expand Down
6 changes: 6 additions & 0 deletions tests/extractors/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ def test_load_from_path(datadir):
assert loaded[0]["entry"] == "one"


def test_load_from_path_doesnt_exist(datadir):
    """Loading a path that has no file behind it yields None."""
    missing_file = datadir / "notreal.json"

    assert load_from_path(missing_file) is None


def test_load_df(datadir):
df = load_df(datadir / "example.json")

Expand Down

0 comments on commit bf83ee2

Please sign in to comment.