Fix linter issues
nelson-liu committed Feb 13, 2023
1 parent 9a06895 commit bc6b681
Showing 8 changed files with 27 additions and 28 deletions.
4 changes: 2 additions & 2 deletions readabilipy/__main__.py
@@ -55,7 +55,7 @@ def main():

    args = parser.parse_args()

-    with open(args.input_file) as h:
+    with open(args.input_file, encoding="utf-8") as h:
        html = h.read()

    article = simple_json_from_html_string(
@@ -65,7 +65,7 @@ def main():
        use_readability=(not args.use_python_parser),
    )

-    with open(args.output_file, "w") as j:
+    with open(args.output_file, "w", encoding="utf-8") as j:
        json.dump(article, j, ensure_ascii=False)


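Both changes in this file add an explicit encoding to open(). This is presumably aimed at pylint's unspecified-encoding check (W1514): without the argument, Python uses the platform's locale encoding, so UTF-8 input can be misread wherever that default is not UTF-8. A minimal sketch of the failure mode (the file name is illustrative):

    # Round-trip a UTF-8 file; the explicit encoding makes the read
    # deterministic across platforms.
    with open("page.html", "w", encoding="utf-8") as f:
        f.write("<p>café</p>")

    with open("page.html", encoding="utf-8") as f:
        assert f.read() == "<p>café</p>"

    # with open("page.html") as f:   # implicit: locale.getpreferredencoding(),
    #     f.read()                   # e.g. cp1252 on Windows, where "café"
    #                                # comes back as "cafÃ©" (or raises)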
7 changes: 3 additions & 4 deletions readabilipy/simple_json.py
@@ -56,7 +56,7 @@ def simple_json_from_html_string(html, content_digests=False, node_indexes=False
            ["node", "ExtractArticle.js", "-i", html_path, "-o", article_json_path], cwd=jsdir)

        # Read output of call to Readability.parse() from JSON file and return as Python dictionary
-        with open(article_json_path, "r") as json_file:
+        with open(article_json_path, "r", encoding="utf-8") as json_file:
            input_json = json.load(json_file)

        # Deleting files after processing
@@ -129,7 +129,7 @@ def plain_text_leaf_node(element):
    # Extract all text, stripped of any child HTML elements and normalise it
    plain_text = normalise_text(element.get_text())
    if plain_text != "" and element.name == "li":
-        plain_text = "* {}, ".format(plain_text)
+        plain_text = f"* {plain_text}, "
    if plain_text == "":
        plain_text = None
    if "data-node-index" in element.attrs:
@@ -210,8 +210,7 @@ def add_node_indexes(element, node_index="0"):
    for local_idx, child in enumerate(
            [c for c in element.contents if not is_text(c)], start=1):
        # Can't add attributes to leaf string types
-        child_index = "{stem}.{local}".format(
-            stem=node_index, local=local_idx)
+        child_index = f"{node_index}.{local_idx}"
        add_node_indexes(child, node_index=child_index)
    return element

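The f-string conversions in this file (and below) are behaviour-preserving; the linter rule involved is presumably pylint's consider-using-f-string (C0209). A quick equivalence check, with names mirroring the add_node_indexes diff:

    node_index, local_idx = "0.3", 2

    old = "{stem}.{local}".format(stem=node_index, local=local_idx)  # named placeholders
    new = f"{node_index}.{local_idx}"                                # direct interpolation

    assert old == new == "0.3.2"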
2 changes: 1 addition & 1 deletion readabilipy/simplifiers/html.py
@@ -300,7 +300,7 @@ def single_replace():
            element.decompose()
            n_removed += 1
    # Remove elements with only zero-length children
-    for element in soup.find_all(lambda elem: sum([len(c) for c in elem.children]) == 0):
+    for element in soup.find_all(lambda elem: sum(len(c) for c in elem.children) == 0):
        element.decompose()
        n_removed += 1
    return n_removed
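Dropping the square brackets passes sum() a generator expression instead of first materialising a list (presumably pylint's consider-using-generator, R1728). The result is identical; the generator simply avoids allocating a throwaway list. A sketch:

    children = ["", "", "x"]

    total_from_list = sum([len(c) for c in children])  # builds a list, then sums it
    total_from_gen = sum(len(c) for c in children)     # consumes lengths lazily

    assert total_from_list == total_from_gen == 1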
2 changes: 1 addition & 1 deletion readabilipy/utils.py
@@ -57,7 +57,7 @@ def run_npm_install():

    with chdir(jsdir):
        try:
-            cp = subprocess.run(["npm", "install"])
+            cp = subprocess.run(["npm", "install"], check=True)
            returncode = cp.returncode
        except FileNotFoundError:
            returncode = 1
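check=True makes subprocess.run() raise CalledProcessError on a non-zero exit status instead of recording it silently (pylint's subprocess-run-check, W1510). A sketch of the difference, using the POSIX utility false as a stand-in for a failing command:

    import subprocess

    cp = subprocess.run(["false"])            # exits 1; failure visible only in returncode
    assert cp.returncode == 1

    try:
        subprocess.run(["false"], check=True)
    except subprocess.CalledProcessError as err:
        assert err.returncode == 1            # failure now surfaces as an exception

Note that with check=True, the returncode read on the following line of run_npm_install can only ever be zero; a failing npm install now raises instead of returning its exit code.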
10 changes: 5 additions & 5 deletions tests/checks.py
@@ -36,7 +36,7 @@ def check_extract_article(test_filename, expected_filename, content_digests=Fals
    test_data_dir = "data"
    # Read HTML test file
    test_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, test_filename)
-    with open(test_filepath) as h:
+    with open(test_filepath, encoding="utf-8") as h:
        html = h.read()

    # Extract simplified article HTML
@@ -47,7 +47,7 @@ def check_extract_article(test_filename, expected_filename, content_digests=Fals

    # Get expected simplified article HTML
    expected_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, expected_filename)
-    with open(expected_filepath) as h:
+    with open(expected_filepath, encoding="utf-8") as h:
        expected_article_json = json.loads(h.read())

    # Test full JSON matches (checks for unexpected fields in either actual or expected JSON)
@@ -58,7 +58,7 @@ def check_extract_paragraphs_as_plain_text(test_filename, expected_filename):
    test_data_dir = "data"
    # Read readable article test file
    test_filepath = os.path.join(os.path.dirname(__file__), test_data_dir, test_filename)
-    with open(test_filepath) as h:
+    with open(test_filepath, encoding="utf-8") as h:
        article = json.loads(h.read())

    # Extract plain text paragraphs
@@ -67,7 +67,7 @@ def check_extract_paragraphs_as_plain_text(test_filename, expected_filename):
    # Get expected plain text paragraphs
    expected_filepath = os.path.join(os.path.dirname(__file__),
                                     test_data_dir, expected_filename)
-    with open(expected_filepath) as h:
+    with open(expected_filepath, encoding="utf-8") as h:
        expected_paragraphs = json.loads(h.read())

    # Test
@@ -87,5 +87,5 @@ def check_html_output_does_not_contain_tag(test_fragment, vetoed_tag):
    # Check that neither <tag> nor </tag> appear in the output
    content = str(article_json["plain_content"])
    if content is not None:
-        for element in ["<{}>".format(vetoed_tag), "</{}>".format(vetoed_tag)]:
+        for element in [f"<{vetoed_tag}>", f"</{vetoed_tag}>"]:
            assert element not in content
2 changes: 1 addition & 1 deletion tests/test_benchmarking.py
@@ -4,7 +4,7 @@


TEST_FILEPATH = os.path.join(os.path.dirname(__file__), "data", "benchmarkinghuge.html")
-with open(TEST_FILEPATH) as h:
+with open(TEST_FILEPATH, encoding="utf-8") as h:
    HTML = h.read()


2 changes: 1 addition & 1 deletion tests/test_html_elements.py
@@ -957,6 +957,6 @@ def test_html_special_sup():
def test_html_remaining_element(element):
    """Simple standalone elements which can contain text.
    Check that the inner text is kept and the tag is discarded."""
-    fragment = "<{0}>Lorem ipsum dolor sit amet</{0}>".format(element)
+    fragment = f"<{element}>Lorem ipsum dolor sit amet</{element}>"
    check_html_output_contains_text(fragment, "Lorem ipsum dolor sit amet")
    check_html_output_does_not_contain_tag(fragment, element)
26 changes: 13 additions & 13 deletions tests/test_simplifiers_text.py
@@ -33,13 +33,13 @@ def test_strip_html_whitespace():


def test_strip_control_characters_non_printing_characters():
-    unnormalised_string = "A string with non-printing characters in​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with non-printing characters in\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with non-printing characters included"
    assert normalise_text(unnormalised_string) == "A string with non-printing characters included"


def test_strip_control_characters_cr():
-    unnormalised_string = "A string with new lines\rin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with new lines\rin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with new lines\rincluded"
    assert normalise_text(unnormalised_string) == "A string with new lines included"

@@ -51,19 +51,19 @@ def test_strip_control_characters_lf():


def test_strip_control_characters_cr_lf():
-    unnormalised_string = "A string with new lines\r\nin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with new lines\r\nin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with new lines\r\nincluded"
    assert normalise_text(unnormalised_string) == "A string with new lines included"


def test_strip_control_characters_ff():
-    unnormalised_string = "A string with form feed\fin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with form feed\fin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with form feed\fincluded"
    assert normalise_text(unnormalised_string) == "A string with form feed included"


def test_strip_control_characters_tab():
-    unnormalised_string = "A string with tabs\tin​c\u200Bluded\ufeff"
+    unnormalised_string = "A string with tabs\tin\u200Bc\u200Bluded\ufeff"
    assert strip_control_characters(unnormalised_string) == "A string with tabs\tincluded"
    assert normalise_text(unnormalised_string) == "A string with tabs included"
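The strings in these tests previously embedded the zero-width space (U+200B) and byte-order mark (U+FEFF) as literal, invisible characters; writing them as \u200B / \ufeff escapes makes them visible in the source (and, presumably, stops the linter flagging invisible literals). A small sketch of what the escaped strings contain:

    s = "in\u200Bc\u200Bluded\ufeff"

    # Three invisible characters, now countable in the source text
    assert len(s) == len("included") + 3

    # Removing them leaves only the visible text, matching the assertions above
    assert s.replace("\u200b", "").replace("\ufeff", "") == "included"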

@@ -72,24 +72,24 @@ def test_strip_control_characters_tab():

@mark.parametrize('terminal_punctuation', text.terminal_punctuation_marks)
def test_ensure_correct_punctuation_joining(terminal_punctuation):
    """Do not join with ' ' if the following character is a punctuation mark."""
-    input_html = """
+    input_html = f"""
    <div>
        <p>
-            Some text <a href="example.com">like this</a>{0} with punctuation.
+            Some text <a href="example.com">like this</a>{terminal_punctuation} with punctuation.
        </p>
-    </div>""".format(terminal_punctuation)
-    expected_output = """<div><p>Some text like this{0} with punctuation.</p></div>""".format(terminal_punctuation)
+    </div>"""
+    expected_output = f"""<div><p>Some text like this{terminal_punctuation} with punctuation.</p></div>"""
    check_exact_html_output(input_html, expected_output)


@mark.parametrize('matched_pair', text.matched_punctuation_marks)
def test_ensure_correct_bracket_quote_joining(matched_pair):
    """Do not join with ' ' if we are inside matched punctuation marks."""
-    input_html = """
+    input_html = f"""
    <div>
        <p>
-            Some text {0}<a href="example.com">like this</a>{1} with punctuation.
+            Some text {matched_pair[0]}<a href="example.com">like this</a>{matched_pair[1]} with punctuation.
        </p>
-    </div>""".format(*matched_pair)
-    expected_output = """<div><p>Some text {0}like this{1} with punctuation.</p></div>""".format(*matched_pair)
+    </div>"""
+    expected_output = f"""<div><p>Some text {matched_pair[0]}like this{matched_pair[1]} with punctuation.</p></div>"""
    check_exact_html_output(input_html, expected_output)
