Skip to content

Commit

Permalink
Merge pull request #1218 from freelawproject/improve-okla-cleanup
Browse files — browse the repository at this point in the history
feat(okla): Improve Okla HTML cleanup
  • Loading branch information
flooie authored Oct 21, 2024
2 parents 1d47cc6 + 8679a81 commit f4de988
Showing 1 changed file with 24 additions and 1 deletion.
25 changes: 24 additions & 1 deletion juriscraper/opinions/united_states/state/okla.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from lxml import html

from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


Expand Down Expand Up @@ -45,7 +46,29 @@ def _process_html(self):

@staticmethod
def cleanup_content(content):
tree = html.fromstring(content)
"""Remove non-opinion HTML

:param content: The scraped HTML
:return: Cleaner HTML
"""
tree = strip_bad_html_tags_insecure(content, remove_scripts=True)
for removal_class in ["tmp-citationizer", "footer"]:
for element in tree.xpath(f"//div[@class='{removal_class}']"):
parent = element.getparent()
if parent is not None:
parent.remove(element)

opinions_navigation = tree.xpath("//div[@id='opinons-navigation']")
if opinions_navigation:
opinions_navigation = opinions_navigation[0]
parent = opinions_navigation.getparent()

# Remove all preceding siblings
for sibling in opinions_navigation.itersiblings(preceding=True):
parent.remove(sibling)
opinions_navigation.getparent().remove(opinions_navigation)

# Find the core element with id 'oscn-content'
core_element = tree.xpath("//*[@id='oscn-content']")[0]
return html.tostring(
core_element, pretty_print=True, encoding="unicode"
Expand Down

0 comments on commit f4de988

Please sign in to comment.