update docs for cli

GateNLP · Jul 11, 2024 · 6c3a346 · 6c3a346
1 parent e2f30be
commit 6c3a346
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 29 deletions.
diff --git a/docs/usage/download.md b/docs/usage/download.md
@@ -9,44 +9,46 @@ $ wpextract download TARGET OUT_JSON
 ```
 
 `TARGET`
-: The HTTP(S) URL of the WordPress site.
+:  The base path of the WordPress installation, e.g.
+  "https://example.org/"
 
 `OUT_JSON`
-: Directory to output to
+: Directory to output the downloaded JSON to. It must be an
+  existing empty directory or a non-existent directory which will be created.
 
 **optional arguments**
-`--json-prefix JSON_PREFIX`
-: Output files with a prefix, e.g. supplying _20240101-example_ will output posts to `out_dir/20240101-example-posts.json`
-
 `--media-dest`
-: Path to download media files to, skipped if not supplied. Must be an empty directory
+: Path to a directory to download media files to, skipped if not supplied
+
+`--json-prefix JSON_PREFIX`
+:  Prefix to add to output file names, e.g. supplying _20240101-example_ will output posts to `out_dir/20240101-example-posts.json`
 
 **skip data**
 
-`--no-categories` `--no-media` `--no-pages` `--no-posts` `--no-tags` `--no-users`
-: Skip downloading the given data type
+`--skip-type [categories|media|pages|posts|tags|users]`
+:  Don't download the provided types. All other types will be downloaded; the default is to download all types.
 
 **authentication**
 
+`--proxy PROXY`
+: Proxy server for requests
+
 `--auth AUTH`
-: Define HTTP Basic credentials in format username:password
+: HTTP Basic credentials for requests (format `username:password`)
 
 `--cookies COOKIES`
-: Define cookies to send with request in the format "cookie1=foo; cookie2=bar"
-
-`--proxy PROXY`
-: Define a proxy server to use
+:  Cookies for requests (format `cookie1=foo; cookie2=bar`)
 
 **request behaviour**
 
 `--timeout TIMEOUT`
-: Stop waiting for a response after a given number of seconds (default: 30)
+: Timeout for request in seconds (default: 30)
 
 `--wait WAIT`
-: Wait the specified number of seconds between retrievals (default: None)
+:  Time to wait between requests in seconds. Does not affect retries. (default: 0)
 
 `--random-wait`
-: Randomly varies the time between requests to between 0.5 and 1.5 times the number of seconds set by –wait
+: Randomly vary the time between requests to between 0.5 and 1.5 times the number of seconds set by `--wait`
 
 `--max-retries MAX_RETRIES`
 : Maximum number of retries before giving up (default: 10)
@@ -59,11 +61,11 @@ $ wpextract download TARGET OUT_JSON
 
 **logging**
 
-`--log LOG`, `-l LOG`
-: Log outputs to this file instead of stdout.
+`--log FILE`, `-l FILE`
+: File to write logs to; when set, logs are written to this file instead of stdout.
 
 `--verbose`, `-v`
-: Show additional debug logs
+: Increase log level to include debug logs
 
 ## Download Process
 

diff --git a/docs/usage/extract.md b/docs/usage/extract.md
@@ -9,27 +9,26 @@ $ wpextract extract json_root out_dir
 ```
 
 `json_root`
-: Path to files generated by the [`wpextract download`](download.md) command
+:  A directory containing a JSON dump of the data files, such as one generated with [`wpextract download`](download.md).
 
 `out_dir`
-: Output directory for generated dataset. This should be different to `json_root` as it will create files with the same name.
+: A path to output the extracted JSON to. It must be an existing empty directory or a non-existent directory which will be created.
 
 **optional arguments**
 
 `--scrape-root SCRAPE_ROOT`
 : Root directory of an HTML scrape, see [scrape crawling](#1-scrape-crawling-optional).
 
 `--json-prefix JSON_PREFIX`
-: Load and output files with a prefix, e.g. supplying _20240101-example_ will output posts to `out_dir/20240101-example-posts.json`
+: Prefix to use for input and output filenames, e.g. supplying _20240101-example_ will output posts to `out_dir/20240101-example-posts.json`
 
 **logging**
 
-`--log LOG`, `-l LOG`
-: Log outputs to this file instead of stdout.
+`--log FILE`, `-l FILE`
+: File to write logs to; when set, logs are written to this file instead of stdout.
 
 `--verbose`, `-v`
-: Show additional debug logs
-
+: Increase log level to include debug logs
 
 
 ## Extraction Process

diff --git a/src/wpextract/cli/_download.py b/src/wpextract/cli/_download.py
@@ -42,13 +42,14 @@ def validate_wait(ctx, param, value):
 
 @click.command(short_help="Download a WordPress site.", **CMD_ARGS)
 @click.argument("target", type=str)
-@click.argument("out_json", type=click.UNPROCESSED, callback=empty_directory)
+@click.argument("out_json", type=click.Path(), callback=empty_directory)
 @click.option(
     "--media-dest",
-    type=click.UNPROCESSED,
+    type=click.Path(),
     callback=empty_directory,
     required=False,
     help="Path to a directory to download media files to, skipped if not supplied",
+    metavar="DIRECTORY"
 )
 @click.option(
     "-P", "--json-prefix", type=str, help="Prefix to add to output file names"
@@ -141,6 +142,7 @@ def download(
     OUT_JSON is the directory to output the downloaded JSON to. It must be an existing empty directory or a non-existent directory which will be created.
     """
     setup_logging(verbose, log)
+    print(verbose)
 
     types_to_dl = set(dl_types) - set(skip_types)
 

diff --git a/src/wpextract/cli/_extract.py b/src/wpextract/cli/_extract.py
@@ -16,7 +16,7 @@
 
 @click.command(short_help="Extract site to a dataset.", **CMD_ARGS)
 @click.argument("json_root", type=directory)
-@click.argument("out_dir", type=click.UNPROCESSED, callback=empty_directory)
+@click.argument("out_dir", type=click.Path(), callback=empty_directory, metavar="DIRECTORY")
 @click.option(
     "-S", "--scrape-root", help="Root directory of an HTML scrape", type=directory
 )