diff --git a/juriscraper/opinions/united_states/state/okla.py b/juriscraper/opinions/united_states/state/okla.py index d75a82331..db5a176d9 100644 --- a/juriscraper/opinions/united_states/state/okla.py +++ b/juriscraper/opinions/united_states/state/okla.py @@ -9,6 +9,7 @@ from lxml import html +from juriscraper.lib.html_utils import strip_bad_html_tags_insecure from juriscraper.OpinionSiteLinear import OpinionSiteLinear @@ -45,7 +46,29 @@ def _process_html(self): @staticmethod def cleanup_content(content): - tree = html.fromstring(content) + """Remove non-opinion HTML + + :param content: The scraped HTML + :return: Cleaner HTML + """ + tree = strip_bad_html_tags_insecure(content, remove_scripts=True) + for removal_class in ["tmp-citationizer", "footer"]: + for element in tree.xpath(f"//div[@class='{removal_class}']"): + parent = element.getparent() + if parent is not None: + parent.remove(element) + + opinions_navigation = tree.xpath("//div[@id='opinons-navigation']") + if opinions_navigation: + opinions_navigation = opinions_navigation[0] + parent = opinions_navigation.getparent() + + # Remove all preceding siblings + for sibling in opinions_navigation.itersiblings(preceding=True): + parent.remove(sibling) + opinions_navigation.getparent().remove(opinions_navigation) + + # Find the core element with id 'oscn-content' core_element = tree.xpath("//*[@id='oscn-content']")[0] return html.tostring( core_element, pretty_print=True, encoding="unicode"