Skip to content

Commit

Permalink
fix(okla): Wrap cleanup content in HTML tag
Browse files Browse the repository at this point in the history
If you don't return an <html> tag, the content is
identified as plain text, which causes issues.
  • Loading branch information
flooie committed Oct 22, 2024
1 parent e00485a commit 4655796
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions juriscraper/opinions/united_states/state/okla.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ def _process_html(self):
"summary": summary.strip(),
}
)
r = self.request["session"].get(url).content
u = self.cleanup_content(r)
print(u)
break

@staticmethod
def cleanup_content(content):
Expand All @@ -52,6 +56,9 @@ def cleanup_content(content):
so we dont end up with ugly HTML. Also we should remove a few sections
and all of the A tags to avoid hyperlinking to nowhere.
Make sure to wrap the content in an HTML tag so it can be properly
processed on CL.
:param content: The scraped HTML
:return: Cleaner HTML
"""
Expand All @@ -62,7 +69,7 @@ def cleanup_content(content):
parent = element.getparent()
if parent is not None:
parent.remove(element)

# Remove the a tags so we dont link around to broken places
for a_tag in tree.xpath("//a"):
span = html.Element("span")
span.text = a_tag.text
Expand All @@ -80,6 +87,5 @@ def cleanup_content(content):

# Find the core element with id 'oscn-content'
core_element = tree.xpath("//*[@id='oscn-content']")[0]
html_content = html.tostring(core_element).decode("ISO-8859-1")

return html_content.strip()
html_content = html.tostring(core_element).decode("ISO-8859-1").strip()
return f"<html>{html_content}</html>"

0 comments on commit 4655796

Please sign in to comment.