Fix linter issues
nelson-liu committed Feb 13, 2023
1 parent 9a06895 commit bc6b681
Showing 8 changed files with 27 additions and 28 deletions.
4 changes: 2 additions & 2 deletions readabilipy/__main__.py
@@ -55,7 +55,7 @@ def main():

    args = parser.parse_args()

-    with open(args.input_file) as h:
+    with open(args.input_file, encoding="utf-8") as h:
        html = h.read()

    article = simple_json_from_html_string(
@@ -65,7 +65,7 @@ def main():
        use_readability=(not args.use_python_parser),
    )

-    with open(args.output_file, "w") as j:
+    with open(args.output_file, "w", encoding="utf-8") as j:
        json.dump(article, j, ensure_ascii=False)


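Both changes in this file add an explicit encoding to open(). This is presumably aimed at pylint's unspecified-encoding check (W1514): without the argument, Python uses the platform's locale encoding, so UTF-8 input can be misread wherever that default is not UTF-8. A minimal sketch of the failure mode (the file name is illustrative):

    # Round-trip a UTF-8 file; the explicit encoding makes the read
    # deterministic across platforms.
    with open("page.html", "w", encoding="utf-8") as f:
        f.write("<p>café</p>")

    with open("page.html", encoding="utf-8") as f:
        assert f.read() == "<p>café</p>"

    # with open("page.html") as f:   # implicit: locale.getpreferredencoding(),
    #     f.read()                   # e.g. cp1252 on Windows, where "café"
    #                                # comes back as "cafÃ©" (or raises)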
7 changes: 3 additions & 4 deletions readabilipy/simple_json.py
@@ -56,7 +56,7 @@ def simple_json_from_html_string(html, content_digests=False, node_indexes=False
            ["node", "ExtractArticle.js", "-i", html_path, "-o", article_json_path], cwd=jsdir)

        # Read output of call to Readability.parse() from JSON file and return as Python dictionary
-        with open(article_json_path, "r") as json_file:
+        with open(article_json_path, "r", encoding="utf-8") as json_file:
            input_json = json.load(json_file)

        # Deleting files after processing
@@ -129,7 +129,7 @@ def plain_text_leaf_node(element):
    # Extract all text, stripped of any child HTML elements and normalise it
    plain_text = normalise_text(element.get_text())
    if plain_text != "" and element.name == "li":
-        plain_text = "* {}, ".format(plain_text)
+        plain_text = f"* {plain_text}, "
    if plain_text == "":
        plain_text = None
    if "data-node-index" in element.attrs:
@@ -210,8 +210,7 @@ def add_node_indexes(element, node_index="0"):
    for local_idx, child in enumerate(
            [c for c in element.contents if not is_text(c)], start=1):
        # Can't add attributes to leaf string types
-        child_index = "{stem}.{local}".format(
-            stem=node_index, local=local_idx)
+        child_index = f"{node_index}.{local_idx}"
        add_node_indexes(child, node_index=child_index)
    return element

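The f-string conversions in this file (and below) are behaviour-preserving; the linter rule involved is presumably pylint's consider-using-f-string (C0209). A quick equivalence check, with names mirroring the add_node_indexes diff:

    node_index, local_idx = "0.3", 2

    old = "{stem}.{local}".format(stem=node_index, local=local_idx)  # named placeholders
    new = f"{node_index}.{local_idx}"                                # direct interpolation

    assert old == new == "0.3.2"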
2 changes: 1 addition & 1 deletion readabilipy/simplifiers/html.py
@@ -300,7 +300,7 @@ def single_replace():
            element.decompose()
            n_removed += 1
    # Remove elements with only zero-length children
-    for element in soup.find_all(lambda elem: sum([len(c) for c in elem.children]) == 0):
+    for element in soup.find_all(lambda elem: sum(len(c) for c in elem.children) == 0):
        element.decompose()
        n_removed += 1
    return n_removed
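Dropping the square brackets passes sum() a generator expression instead of first materialising a list (presumably pylint's consider-using-generator, R1728). The result is identical; the generator simply avoids allocating a throwaway list. A sketch:

    children = ["", "", "x"]

    total_from_list = sum([len(c) for c in children])  # builds a list, then sums it
    total_from_gen = sum(len(c) for c in children)     # consumes lengths lazily

    assert total_from_list == total_from_gen == 1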
2 changes: 1 addition & 1 deletion readabilipy/utils.py
@@ -57,7 +57,7 @@ def run_npm_install():

    with chdir(jsdir):
        try:
-            cp = subprocess.run(["npm", "install"])
+            cp = subprocess.run(["npm", "install"], check=True)
            returncode = cp.returncode
        except FileNotFoundError:
            returncode = 1
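check=True makes subprocess.run() raise CalledProcessError on a non-zero exit status instead of recording it silently (pylint's subprocess-run-check, W1510). A sketch of the difference, using the POSIX utility false as a stand-in for a failing command:

    import subprocess

    cp = subprocess.run(["false"])            # exits 1; failure visible only in returncode
    assert cp.returncode == 1

    try:
        subprocess.run(["false"], check=True)
    except subprocess.CalledProcessError as err:
        assert err.returncode == 1            # failure now surfaces as an exception

Note that with check=True, the returncode read on the following line of run_npm_install can only ever be zero; a failing npm install now raises instead of returning its exit code.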
10 changes: 5 additions & 5 deletions tests/checks.py
@@ -36,7 +36,7 @@ def check_extract_article(test_filename, expected_filename, content_digests=Fals
    test_data_dir = "data"
    # Read HTML test file
    test_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, test_filename)
-    with open(test_filepath) as h:
+    with open(test_filepath, encoding="utf-8") as h:
        html = h.read()

    # Extract simplified article HTML
@@ -47,7 +47,7 @@ def check_extract_article(test_filename, expected_filename, content_digests=Fals

    # Get expected simplified article HTML
    expected_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, expected_filename)
-    with open(expected_filepath) as h:
+    with open(expected_filepath, encoding="utf-8") as h:
        expected_article_json = json.loads(h.read())

    # Test full JSON matches (checks for unexpected fields in either actual or expected JSON)
@@ -58,7 +58,7 @@ def check_extract_paragraphs_as_plain_text(test_filename, expected_filename):
    test_data_dir = "data"
    # Read readable article test file
    test_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, test_filename)
-    with open(test_filepath) as h:
+    with open(test_filepath, encoding="utf-8") as h:
        article = json.loads(h.read())

    # Extract plain text paragraphs
@@ -67,7 +67,7 @@ def check_extract_paragraphs_as_plain_text(test_filename, expected_filename):
    # Get expected plain text paragraphs
    expected_filepath = os.path.join(os.path.dirname(__file__),
                                     test_data_dir, expected_filename)
-    with open(expected_filepath) as h:
+    with open(expected_filepath, encoding="utf-8") as h:
        expected_paragraphs = json.loads(h.read())

    # Test
@@ -87,5 +87,5 @@ def check_html_output_does_not_contain_tag(test_fragment, vetoed_tag):
    # Check that neither <tag> nor </tag> appear in the output
    content = str(article_json["plain_content"])
    if content is not None:
-        for element in ["<{}>".format(vetoed_tag), "</{}>".format(vetoed_tag)]:
+        for element in [f"<{vetoed_tag}>", f"</{vetoed_tag}>"]:
            assert element not in content
2 changes: 1 addition & 1 deletion tests/test_benchmarking.py
@@ -4,7 +4,7 @@


TEST_FILEPATH = os.path.join(os.path.dirname(__file__), "data", "benchmarkinghuge.html")
-with open(TEST_FILEPATH) as h:
+with open(TEST_FILEPATH, encoding="utf-8") as h:
    HTML = h.read()


2 changes: 1 addition & 1 deletion tests/test_html_elements.py
@@ -957,6 +957,6 @@ def test_html_special_sup():
def test_html_remaining_element(element):
    """Simple standalone elements which can contain text.
    Check that the inner text is kept and the tag is discarded."""
-    fragment = "<{0}>Lorem ipsum dolor sit amet</{0}>".format(element)
+    fragment = f"<{element}>Lorem ipsum dolor sit amet</{element}>"
    check_html_output_contains_text(fragment, "Lorem ipsum dolor sit amet")
    check_html_output_does_not_contain_tag(fragment, element)
26 changes: 13 additions & 13 deletions tests/test_simplifiers_text.py
@@ -33,13 +33,13 @@ def test_strip_html_whitespace():


def test_strip_control_characters_non_printing_characters():
-    unnormalised_string = "A string with non-printing characters in​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with non-printing characters in\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with non-printing characters included"
    assert normalise_text(unnormalised_string) == "A string with non-printing characters included"


def test_strip_control_characters_cr():
-    unnormalised_string = "A string with new lines\rin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with new lines\rin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with new lines\rincluded"
    assert normalise_text(unnormalised_string) == "A string with new lines included"

@@ -51,19 +51,19 @@ def test_strip_control_characters_lf():


def test_strip_control_characters_cr_lf():
-    unnormalised_string = "A string with new lines\r\nin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with new lines\r\nin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with new lines\r\nincluded"
    assert normalise_text(unnormalised_string) == "A string with new lines included"


def test_strip_control_characters_ff():
-    unnormalised_string = "A string with form feed\fin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with form feed\fin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with form feed\fincluded"
    assert normalise_text(unnormalised_string) == "A string with form feed included"


def test_strip_control_characters_tab():
-    unnormalised_string = "A string with tabs\tin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with tabs\tin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with tabs\tincluded"
    assert normalise_text(unnormalised_string) == "A string with tabs included"
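The strings in these tests previously embedded the zero-width space (U+200B) and byte-order mark (U+FEFF) as literal, invisible characters; writing them as \u200B / \ufeff escapes makes them visible in the source (and, presumably, stops the linter flagging invisible literals). A small sketch of what the escaped strings contain:

    s = "in\u200Bc\u200Bluded\ufeff"

    # Three invisible characters, now countable in the source text
    assert len(s) == len("included") + 3

    # Removing them leaves only the visible text, matching the assertions above
    assert s.replace("\u200b", "").replace("\ufeff", "") == "included"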

@@ -72,24 +72,24 @@ def test_strip_control_characters_tab():

@mark.parametrize('terminal_punctuation', text.terminal_punctuation_marks)
def test_ensure_correct_punctuation_joining(terminal_punctuation):
    """Do not join with ' ' if the following character is a punctuation mark."""
-    input_html = """
+    input_html = f"""
    <div>
        <p>
-            Some text <a href="example.com">like this</a>{0} with punctuation.
+            Some text <a href="example.com">like this</a>{terminal_punctuation} with punctuation.
        </p>
-    </div>""".format(terminal_punctuation)
-    expected_output = """<div><p>Some text like this{0} with punctuation.</p></div>""".format(terminal_punctuation)
+    </div>"""
+    expected_output = f"""<div><p>Some text like this{terminal_punctuation} with punctuation.</p></div>"""
    check_exact_html_output(input_html, expected_output)


@mark.parametrize('matched_pair', text.matched_punctuation_marks)
def test_ensure_correct_bracket_quote_joining(matched_pair):
    """Do not join with ' ' if we are inside matched punctuation marks."""
-    input_html = """
+    input_html = f"""
    <div>
        <p>
-            Some text {0}<a href="example.com">like this</a>{1} with punctuation.
+            Some text {matched_pair[0]}<a href="example.com">like this</a>{matched_pair[1]} with punctuation.
        </p>
-    </div>""".format(*matched_pair)
-    expected_output = """<div><p>Some text {0}like this{1} with punctuation.</p></div>""".format(*matched_pair)
+    </div>"""
+    expected_output = f"""<div><p>Some text {matched_pair[0]}like this{matched_pair[1]} with punctuation.</p></div>"""
    check_exact_html_output(input_html, expected_output)
