CLI: add 126 exit code for high error ratio #747

Merged: 3 commits, Nov 22, 2024
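Summary of the change: the CLI previously exited with code 1 whenever any URL in a batch failed. It now distinguishes outcomes: 0 when no errors occur, 1 when some downloads fail, and 126 when the error ratio exceeds 0.99, i.e. when virtually the whole batch fails. A minimal sketch of how a calling script might branch on the new codes, assuming the batch is passed via the -i input-file option ("urls.txt" is a placeholder):

import subprocess

# Run the CLI on a hypothetical URL list and inspect its exit status.
result = subprocess.run(["trafilatura", "-i", "urls.txt"])

if result.returncode == 126:
    print("nearly all URLs failed, check the network or the input list")
elif result.returncode == 1:
    print("finished with some errors")
else:
    print("finished without errors")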
tests/cli_tests.py: 6 changes (5 additions, 1 deletion)
@@ -240,6 +240,10 @@ def test_sysoutput():
 
 def test_download():
     """test page download and command-line interface"""
+    assert cli_utils._define_exit_code([], 0) == 0
+    assert cli_utils._define_exit_code(["a"], 1) == 126
+    assert cli_utils._define_exit_code(["a"], 2) == 1
+
     testargs = ["", "-v"]
     with patch.object(sys, "argv", testargs):
         args = cli.parse_args(testargs)
@@ -264,7 +268,7 @@ def test_download():
         args = cli.parse_args(testargs)
         with pytest.raises(SystemExit) as e:
             cli.process_args(args)
-        assert e.type == SystemExit and e.value.code == 1
+        assert e.type == SystemExit and e.value.code == 126
 
 
 # @patch('trafilatura.settings.MAX_FILES_PER_DIRECTORY', 1)
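The new assertions pin down the mapping: an empty error list yields 0, one error out of one URL is a ratio of 1.0 and yields 126, and one error out of two (0.5) yields 1. An illustrative boundary check, not part of the PR's test suite, showing that the threshold is strict:

# Hypothetical extra assertions: 99 errors out of 100 is a ratio of exactly
# 0.99, which is not strictly greater than the threshold, so the generic
# error code 1 applies; one more failure tips the result to 126.
assert cli_utils._define_exit_code(["e"] * 99, 100) == 1
assert cli_utils._define_exit_code(["e"] * 100, 100) == 126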
trafilatura/cli.py: 19 changes (7 additions, 12 deletions)
@@ -197,7 +197,7 @@ def main() -> None:
 
 def process_args(args: Any) -> None:
     """Perform the actual processing according to the arguments"""
-    error_caught = False
+    exit_code = 0
 
     if args.verbose == 1:
         logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
@@ -211,7 +211,7 @@ def process_args(args: Any) -> None:
 
     # fetch urls from a feed or a sitemap
     if args.explore or args.feed or args.sitemap:
-        cli_discovery(args)
+        exit_code = cli_discovery(args)
 
     # activate crawler/spider
     elif args.crawl:
@@ -225,24 +225,19 @@ def process_args(args: Any) -> None:
     elif args.input_dir:
         file_processing_pipeline(args)
 
-    # read url list from input file
-    elif args.input_file:
+    # read url list from input file or process input URL
+    elif args.input_file or args.URL:
         url_store = load_input_dict(args)
-        error_caught = url_processing_pipeline(args, url_store)
-
-    # process input URL
-    elif args.URL:
-        url_store = load_input_dict(args)
-        error_caught = url_processing_pipeline(args, url_store)  # process single url
+        exit_code = url_processing_pipeline(args, url_store)
 
     # read input on STDIN directly
     else:
         result = examine(sys.stdin.buffer.read(), args, url=args.URL)
         write_result(result, args)
 
     # change exit code if there are errors
-    if error_caught:
-        sys.exit(1)
+    if exit_code != 0:
+        sys.exit(exit_code)
 
 
 if __name__ == '__main__':
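Since process_args() only calls sys.exit() for a non-zero code, a clean run falls through and the process terminates with status 0 as before. A quick way to observe the propagated code, modeled on the test above (the URL is a placeholder that is expected to fail, so the error ratio is 1.0):

import sys
from unittest.mock import patch

from trafilatura import cli

testargs = ["", "-u", "https://example.invalid"]
with patch.object(sys, "argv", testargs):
    args = cli.parse_args(testargs)
    try:
        cli.process_args(args)
    except SystemExit as exc:
        # 126 expected: the single URL cannot be fetched
        print("exit code:", exc.code)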
trafilatura/cli_utils.py: 29 changes (22 additions, 7 deletions)
@@ -288,7 +288,7 @@ def download_queue_processing(
     return errors, counter
 
 
-def cli_discovery(args: Any) -> None:
+def cli_discovery(args: Any) -> int:
     "Group CLI functions dedicated to URL discovery."
     url_store = load_input_dict(args)
     input_urls = url_store.dump_urls()
@@ -320,14 +320,16 @@
     reset_caches()
 
     # process the (rest of the) links found
-    error_caught = url_processing_pipeline(args, url_store)
+    exit_code = url_processing_pipeline(args, url_store)
 
     # activate site explorer
     if args.explore:
         # add to compressed dict and crawl the remaining websites
         control_dict = build_exploration_dict(url_store, input_urls, args)
         cli_crawler(args, url_store=control_dict, options=options)
 
+    return exit_code
+
 
 def build_exploration_dict(
     url_store: UrlStore, input_urls: List[str], args: Any
@@ -417,18 +419,31 @@ def probe_homepage(args: Any) -> None:
         print(url, flush=True)
 
 
-def url_processing_pipeline(args: Any, url_store: UrlStore) -> bool:
+def _define_exit_code(errors: List[str], total: int) -> int:
+    """Compute exit code based on the number of errors:
+       0 if there are no errors, 126 if there are too many, 1 otherwise."""
+    ratio = len(errors) / total if total > 0 else 0
+
+    if ratio > 0.99:
+        return 126
+    if errors:
+        return 1
+    return 0
+
+
+def url_processing_pipeline(args: Any, url_store: UrlStore) -> int:
     "Aggregated functions to show a list and download and process an input list."
     if args.list:
         url_store.print_unvisited_urls()  # and not write_result()
         return False  # and not sys.exit(0)
 
     options = args_to_extractor(args)
-    counter = 0 if url_store.total_url_number() > MAX_FILES_PER_DIRECTORY else -1
+    url_count = url_store.total_url_number()
+    counter = 0 if url_count > MAX_FILES_PER_DIRECTORY else -1
 
     # download strategy
     errors, counter = download_queue_processing(url_store, args, counter, options)
-    LOGGER.debug("%s URLs could not be found", len(errors))
+    LOGGER.debug("%s / %s URLs could not be found", len(errors), url_count)
 
     if args.archived is True:
         url_store = UrlStore()
@@ -443,9 +458,9 @@ def url_processing_pipeline(args: Any, url_store: UrlStore) -> bool:
             len(errors),
         )
         # pass information along if URLs are missing
-        return bool(archived_errors)
+        return _define_exit_code(archived_errors, url_store.total_url_number())
 
-    return bool(errors)
+    return _define_exit_code(errors, url_count)
 
 
 def file_processing_pipeline(args: Any) -> None:
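One detail to note: the --list early return in url_processing_pipeline still returns False even though the annotation is now -> int. This is harmless at runtime because bool is a subclass of int in Python, as the snippet below illustrates; the caller's "if exit_code != 0" check treats False exactly like 0.

# False is a valid int at runtime, so the unchanged "return False" in the
# --list branch still satisfies the new "-> int" contract in practice.
exit_code = False
assert isinstance(exit_code, int)
assert exit_code == 0  # interpreted as a clean exit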