Merge pull request #1218 from freelawproject/improve-okla-cleanup

feat(okla): Improve Okla HTML cleanup
freelawproject · Oct 21, 2024 · f4de988
2 parents 1d47cc6 + 8679a81
commit f4de988
Showing 1 changed file with 24 additions and 1 deletion.
diff --git a/juriscraper/opinions/united_states/state/okla.py b/juriscraper/opinions/united_states/state/okla.py
@@ -9,6 +9,7 @@
 
 from lxml import html
 
+from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
 from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
@@ -45,7 +46,29 @@ def _process_html(self):
 
     @staticmethod
     def cleanup_content(content):
-        tree = html.fromstring(content)
+        """Remove non-opinion HTML
+
+        :param content: The scraped HTML
+        :return: Cleaner HTML
+        """
+        tree = strip_bad_html_tags_insecure(content, remove_scripts=True)
+        for removal_class in ["tmp-citationizer", "footer"]:
+            for element in tree.xpath(f"//div[@class='{removal_class}']"):
+                parent = element.getparent()
+                if parent is not None:
+                    parent.remove(element)
+
+        opinions_navigation = tree.xpath("//div[@id='opinons-navigation']")
+        if opinions_navigation:
+            opinions_navigation = opinions_navigation[0]
+            parent = opinions_navigation.getparent()
+
+            # Remove all preceding siblings
+            for sibling in opinions_navigation.itersiblings(preceding=True):
+                parent.remove(sibling)
+            opinions_navigation.getparent().remove(opinions_navigation)
+
+        # Find the core element with id 'oscn-content'
         core_element = tree.xpath("//*[@id='oscn-content']")[0]
         return html.tostring(
             core_element, pretty_print=True, encoding="unicode"