From bc6b681166908f0e61eb1e1e554fbf9a2a306341 Mon Sep 17 00:00:00 2001
From: Nelson Liu
Date: Mon, 13 Feb 2023 14:41:27 -0800
Subject: [PATCH] Fix linter issues

---
 readabilipy/__main__.py         |  4 ++--
 readabilipy/simple_json.py      |  7 +++----
 readabilipy/simplifiers/html.py |  2 +-
 readabilipy/utils.py            |  2 +-
 tests/checks.py                 | 10 +++++-----
 tests/test_benchmarking.py      |  2 +-
 tests/test_html_elements.py     |  2 +-
 tests/test_simplifiers_text.py  | 26 +++++++++++++-------------
 8 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/readabilipy/__main__.py b/readabilipy/__main__.py
index 3054b43..676bd4d 100644
--- a/readabilipy/__main__.py
+++ b/readabilipy/__main__.py
@@ -55,7 +55,7 @@ def main():
 
     args = parser.parse_args()
 
-    with open(args.input_file) as h:
+    with open(args.input_file, encoding="utf-8") as h:
         html = h.read()
 
     article = simple_json_from_html_string(
@@ -65,7 +65,7 @@
         use_readability=(not args.use_python_parser),
     )
 
-    with open(args.output_file, "w") as j:
+    with open(args.output_file, "w", encoding="utf-8") as j:
         json.dump(article, j, ensure_ascii=False)

diff --git a/readabilipy/simple_json.py b/readabilipy/simple_json.py
index d738a96..ef4206f 100644
--- a/readabilipy/simple_json.py
+++ b/readabilipy/simple_json.py
@@ -56,7 +56,7 @@ def simple_json_from_html_string(html, content_digests=False, node_indexes=False
             ["node", "ExtractArticle.js", "-i", html_path, "-o", article_json_path],
             cwd=jsdir)
         # Read output of call to Readability.parse() from JSON file and return as Python dictionary
-        with open(article_json_path, "r") as json_file:
+        with open(article_json_path, "r", encoding="utf-8") as json_file:
             input_json = json.load(json_file)
 
         # Deleting files after processing
@@ -129,7 +129,7 @@ def plain_text_leaf_node(element):
     # Extract all text, stripped of any child HTML elements and normalise it
     plain_text = normalise_text(element.get_text())
     if plain_text != "" and element.name == "li":
-        plain_text = "* {}, ".format(plain_text)
+        plain_text = f"* {plain_text}, "
     if plain_text == "":
         plain_text = None
     if "data-node-index" in element.attrs:
@@ -210,8 +210,7 @@ def add_node_indexes(element, node_index="0"):
     for local_idx, child in enumerate(
             [c for c in element.contents if not is_text(c)], start=1):
         # Can't add attributes to leaf string types
-        child_index = "{stem}.{local}".format(
-            stem=node_index, local=local_idx)
+        child_index = f"{node_index}.{local_idx}"
         add_node_indexes(child, node_index=child_index)
     return element
 

diff --git a/readabilipy/simplifiers/html.py b/readabilipy/simplifiers/html.py
index 103d88d..f68aafd 100644
--- a/readabilipy/simplifiers/html.py
+++ b/readabilipy/simplifiers/html.py
@@ -300,7 +300,7 @@ def single_replace():
             element.decompose()
             n_removed += 1
     # Remove elements with only zero-length children
-    for element in soup.find_all(lambda elem: sum([len(c) for c in elem.children]) == 0):
+    for element in soup.find_all(lambda elem: sum(len(c) for c in elem.children) == 0):
         element.decompose()
         n_removed += 1
     return n_removed

diff --git a/readabilipy/utils.py b/readabilipy/utils.py
index 1571158..54401eb 100644
--- a/readabilipy/utils.py
+++ b/readabilipy/utils.py
@@ -57,7 +57,7 @@ def run_npm_install():
 
     with chdir(jsdir):
         try:
-            cp = subprocess.run(["npm", "install"])
+            cp = subprocess.run(["npm", "install"], check=True)
             returncode = cp.returncode
         except FileNotFoundError:
             returncode = 1

diff --git a/tests/checks.py b/tests/checks.py
index f3390a0..aec404d 100644
--- a/tests/checks.py
+++ b/tests/checks.py
@@ -36,7 +36,7 @@ def check_extract_article(test_filename, expected_filename, content_digests=Fals
     test_data_dir = "data"
     # Read HTML test file
     test_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, test_filename)
-    with open(test_filepath) as h:
+    with open(test_filepath, encoding="utf-8") as h:
         html = h.read()
 
     # Extract simplified article HTML
@@ -47,7 +47,7 @@ def check_extract_article(test_filename, expected_filename, content_digests=Fals
 
     # Get expected simplified article HTML
     expected_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, expected_filename)
-    with open(expected_filepath) as h:
+    with open(expected_filepath, encoding="utf-8") as h:
         expected_article_json = json.loads(h.read())
 
     # Test full JSON matches (checks for unexpected fields in either actual or expected JSON)
@@ -58,7 +58,7 @@ def check_extract_paragraphs_as_plain_text(test_filename, expected_filename):
     test_data_dir = "data"
     # Read readable article test file
     test_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, test_filename)
-    with open(test_filepath) as h:
+    with open(test_filepath, encoding="utf-8") as h:
         article = json.loads(h.read())
 
     # Extract plain text paragraphs
@@ -67,7 +67,7 @@ def check_extract_paragraphs_as_plain_text(test_filename, expected_filename):
 
     # Get expected plain text paragraphs
     expected_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, expected_filename)
-    with open(expected_filepath) as h:
+    with open(expected_filepath, encoding="utf-8") as h:
         expected_paragraphs = json.loads(h.read())
 
     # Test
@@ -87,5 +87,5 @@ def check_html_output_does_not_contain_tag(test_fragment, vetoed_tag):
     # Check that neither <tag> nor </tag> appear in the output
     content = str(article_json["plain_content"])
     if content is not None:
-        for element in ["<{}>".format(vetoed_tag), "</{}>".format(vetoed_tag)]:
+        for element in [f"<{vetoed_tag}>", f"</{vetoed_tag}>"]:
             assert element not in content

diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py
index 97caca9..71ae8a1 100644
--- a/tests/test_benchmarking.py
+++ b/tests/test_benchmarking.py
@@ -4,7 +4,7 @@
 
 TEST_FILEPATH = os.path.join(os.path.dirname(__file__), "data", "benchmarkinghuge.html")
 
-with open(TEST_FILEPATH) as h:
+with open(TEST_FILEPATH, encoding="utf-8") as h:
     HTML = h.read()
 

diff --git a/tests/test_html_elements.py b/tests/test_html_elements.py
index 3a968df..ee6215c 100644
--- a/tests/test_html_elements.py
+++ b/tests/test_html_elements.py
@@ -957,6 +957,6 @@ def test_html_special_sup():
 def test_html_remaining_element(element):
     """Simple standalone elements which can contain text.
     Check that the inner text is kept and the tag is discarded."""
-    fragment = "<{0}>Lorem ipsum dolor sit amet</{0}>".format(element)
+    fragment = f"<{element}>Lorem ipsum dolor sit amet</{element}>"
     check_html_output_contains_text(fragment, "Lorem ipsum dolor sit amet")
     check_html_output_does_not_contain_tag(fragment, element)

diff --git a/tests/test_simplifiers_text.py b/tests/test_simplifiers_text.py
index ad60ce0..a2296a6 100644
--- a/tests/test_simplifiers_text.py
+++ b/tests/test_simplifiers_text.py
@@ -33,13 +33,13 @@ def test_strip_html_whitespace():
 
 
 def test_strip_control_characters_non_printing_characters():
-    unnormalised_string = "A string with non-printing characters in​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with non-printing characters in\u200Bc\u200Bluded\ufeff"
     assert strip_control_characters(unnormalised_string) == "A string with non-printing characters included"
     assert normalise_text(unnormalised_string) == "A string with non-printing characters included"
 
 
 def test_strip_control_characters_cr():
-    unnormalised_string = "A string with new lines\rin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with new lines\rin\u200Bc\u200Bluded\ufeff"
     assert strip_control_characters(unnormalised_string) == "A string with new lines\rincluded"
     assert normalise_text(unnormalised_string) == "A string with new lines included"
 
@@ -51,19 +51,19 @@ def test_strip_control_characters_lf():
 
 
 def test_strip_control_characters_cr_lf():
-    unnormalised_string = "A string with new lines\r\nin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with new lines\r\nin\u200Bc\u200Bluded\ufeff"
     assert strip_control_characters(unnormalised_string) == "A string with new lines\r\nincluded"
     assert normalise_text(unnormalised_string) == "A string with new lines included"
 
 
 def test_strip_control_characters_ff():
-    unnormalised_string = "A string with form feed\fin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with form feed\fin\u200Bc\u200Bluded\ufeff"
     assert strip_control_characters(unnormalised_string) == "A string with form feed\fincluded"
     assert normalise_text(unnormalised_string) == "A string with form feed included"
 
 
 def test_strip_control_characters_tab():
-    unnormalised_string = "A string with tabs\tin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with tabs\tin\u200Bc\u200Bluded\ufeff"
     assert strip_control_characters(unnormalised_string) == "A string with tabs\tincluded"
     assert normalise_text(unnormalised_string) == "A string with tabs included"
 
@@ -72,24 +72,24 @@ def test_strip_control_characters_tab():
 @mark.parametrize('terminal_punctuation', text.terminal_punctuation_marks)
 def test_ensure_correct_punctuation_joining(terminal_punctuation):
     """Do not join with ' ' if the following character is a punctuation mark."""
-    input_html = """
+    input_html = f"""
         <div>
-            Some text like this{0} with punctuation.
+            Some text like this{terminal_punctuation} with punctuation.
         </div>
-    """.format(terminal_punctuation)
-    expected_output = """
+    """
+    expected_output = f"""
         <div>
-            Some text like this{0} with punctuation.
+            Some text like this{terminal_punctuation} with punctuation.
         </div>
-    """.format(terminal_punctuation)
+    """
     check_exact_html_output(input_html, expected_output)
 
 
 @mark.parametrize('matched_pair', text.matched_punctuation_marks)
 def test_ensure_correct_bracket_quote_joining(matched_pair):
     """Do not join with ' ' if we are inside matched punctuation marks."""
-    input_html = """
+    input_html = f"""
         <div>
-            Some text {0}like this{1} with punctuation.
+            Some text {matched_pair[0]}like this{matched_pair[1]} with punctuation.
         </div>
-    """.format(*matched_pair)
-    expected_output = """
+    """
+    expected_output = f"""
         <div>
-            Some text {0}like this{1} with punctuation.
+            Some text {matched_pair[0]}like this{matched_pair[1]} with punctuation.
         </div>
-    """.format(*matched_pair)
+    """
     check_exact_html_output(input_html, expected_output)