diff --git a/tests/filters_tests.py b/tests/filters_tests.py index 15d5eb04..b84d0db5 100644 --- a/tests/filters_tests.py +++ b/tests/filters_tests.py @@ -100,15 +100,21 @@ def doc3(): my_h2 = '

42

' return html.fromstring('' + my_h1 + my_h2 + my_p*50 + '') + def doc4(): + my_p = '

abc

' + return html.fromstring('' + my_p + '') + #test xpath pruning assert extract(doc(), prune_xpath='//p') == '' assert extract(doc2(), prune_xpath='//p') == 'ABC' assert extract(doc2(), prune_xpath=['//p', '//h1']) == '' assert extract(doc3(), prune_xpath=['//p', '//h1']) == '42' + assert extract(doc4(), prune_xpath=['//comment()']) == 'abc' # sanity check assert extract(doc()) != '' assert extract(doc2()) != '' assert extract(doc3()) != '' + assert extract(doc4()) != '' if __name__ == '__main__': diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index d78734fe..0b8a0b07 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -109,7 +109,7 @@ def prune_unwanted_nodes( # There is a previous node, append text to its tail prev.tail = (prev.tail or "") + " " + subtree.tail # remove the node - subtree.getparent().remove(subtree) + delete_element(subtree) if with_backup: new_len = len(tree.text_content())