Skip to content

Commit

Permalink
Merge pull request #753 from ttys0dev/appelate-attachments
Browse files Browse the repository at this point in the history
  • Loading branch information
mlissner authored Dec 31, 2024
2 parents dde9d68 + a1d3db4 commit 66a4522
Show file tree
Hide file tree
Showing 13 changed files with 2,847 additions and 149 deletions.
130 changes: 129 additions & 1 deletion juriscraper/pacer/appellate_docket.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import re
import sys
from collections import OrderedDict
from typing import Any, Dict, List, Optional

from lxml.html import tostring
from lxml import html
from lxml.etree import _ElementUnicodeResult

from ..lib.judge_parsers import normalize_judge_string
from ..lib.log_tools import make_default_logger
Expand All @@ -18,6 +20,8 @@
from .reports import BaseReport
from .utils import (
get_court_id_from_url,
get_file_size_str_from_tr,
get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
is_pdf,
)
Expand Down Expand Up @@ -562,6 +566,97 @@ def parties(self):
self._parties = parties
return parties

def _get_attachment_number(self, row: html.HtmlElement) -> int:
    """Parse the attachment number out of an attachment table row.

    :param row: Table row as an lxml element
    :return: The first text node found under the row's ``td`` cells,
    stripped and converted to an int
    """
    cell_texts = row.xpath(".//td/text()")
    first_text = cell_texts[0]
    return int(first_text.strip())

def _get_description_from_tr(self, row: html.HtmlElement) -> str:
    """Get the attachment description from a table row.

    :param row: Table row as an lxml element
    :return: The first text node of the row's fourth cell, stripped, or an
    empty string when that cell contains no text
    """
    # Plain string literal: the XPath has no placeholders to interpolate.
    description_text_nodes = row.xpath("./td[4]//text()")
    if not description_text_nodes:
        # No text in the cell.
        return ""
    return force_unicode(description_text_nodes[0].strip())

@staticmethod
def _get_page_count_from_tr(tr: html.HtmlElement) -> Optional[int]:
    """Extract the page count for a row of the attachment table.

    The count is requested as index 2 of a four-field, space-separated
    input value.

    :param tr: Table row as an lxml element
    :return: The page count as an int, or None when the helper yields no
    value
    """
    raw_count = get_input_value_from_tr(tr, 2, 4, " ")
    return None if raw_count is None else int(raw_count)

@staticmethod
def _get_file_size_bytes_from_tr(tr: html.HtmlElement) -> Optional[int]:
    """Extract the file size in bytes for a row of the attachment table.

    The size is requested as index 3 of a four-field, space-separated
    input value.

    :param tr: Table row as an lxml element
    :return: The size as an int, or None when no value is available or the
    parsed size is zero
    """
    raw_size = get_input_value_from_tr(tr, 3, 4, " ")
    if raw_size is None:
        return None
    # A parsed size of zero is reported the same way as a missing size.
    return int(raw_size) or None

@staticmethod
def _get_pacer_doc_id(row: html.HtmlElement) -> List[str]:
    """Collect the ``data-pacer-doc-id`` attribute values found in a row.

    :param row: Table row as an lxml element
    :return: The XPath result list, one item per descendant anchor that
    carries a ``data-pacer-doc-id`` attribute; empty when there is none
    """
    # NOTE(review): another `_get_pacer_doc_id(cell)` appears further down
    # in this file. If both are members of the same class, the later
    # definition replaces this one -- confirm which is intended.
    return row.xpath(".//a/@data-pacer-doc-id")

@staticmethod
def _get_pacer_seq_no_from_tr(row: html.HtmlElement) -> Optional[str]:
    """Take a row of the attachment table and return the sequence number.

    The number is read as the first space-separated field of the ``value``
    attribute of the first ``input`` element in the row.

    :param row: Table row as an lxml element
    :return: The sequence number, or None when the row has no input or the
    input has no value attribute
    """
    inputs = row.xpath(".//input")
    if not inputs:
        # No input in the row. Maybe it's sealed.
        return None
    values = inputs[0].xpath("./@value")
    if not values:
        # The input carries no value attribute.
        return None
    return values[0].split(" ")[0]

def _get_attachments(
    self, cells: html.HtmlElement
) -> List[Dict[str, Any]]:
    """Build the attachment dicts for the nested table inside an element.

    :param cells: lxml element whose ``./table//tr//tr`` rows are parsed;
    the first matched row is skipped
    :return: One dict per remaining row. The ``file_size_bytes`` key is
    only present when a size could be parsed for that row
    """
    attachments = []
    for tr in cells.xpath("./table//tr//tr")[1:]:
        entry = {
            "attachment_number": self._get_attachment_number(tr),
            "description": self._get_description_from_tr(tr),
            "page_count": self._get_page_count_from_tr(tr),
            "file_size_str": get_file_size_str_from_tr(tr),
            "pacer_doc_id": self._get_pacer_doc_id(tr),
            # Re-parsing the seq_no per row may be redundant: so far it
            # has always matched the main document. It is cheap, so do it
            # anyway.
            "pacer_seq_no": self._get_pacer_seq_no_from_tr(tr),
        }
        size_in_bytes = self._get_file_size_bytes_from_tr(tr)
        if size_in_bytes is not None:
            entry["file_size_bytes"] = size_in_bytes
        attachments.append(entry)
    return attachments

@property
def docket_entries(self):
"""Get the docket entries"""
Expand All @@ -577,19 +672,38 @@ def docket_entries(self):
)
docket_entry_rows = self.tree.xpath(path)

# Detect if the report was generated with "View multiple documents"
# option enabled.
view_multiple_documents = False
view_selected_btn = self.tree.xpath("//input[@value='View Selected']")
if view_selected_btn:
view_multiple_documents = True
docket_entries = []
for row in docket_entry_rows:
de = {}
cells = row.xpath("./td")
if len(cells) == 0:
continue
if len(cells) == 1:
if cells[0].text_content() == "No docket entries found.":
break
continue

date_filed_str = force_unicode(cells[0].text_content())
if not date_filed_str.strip():
if view_multiple_documents and len(cells) >= 3:
last_de = docket_entries[-1]
attachments = self._get_attachments(cells[2])
if len(attachments) == 0:
continue
last_de["attachments"] = attachments
continue
de["date_filed"] = convert_date_string(date_filed_str)
de["document_number"] = self._get_document_number(cells[1])
de["pacer_doc_id"] = self._get_pacer_doc_id(cells[1])
pacer_seq_no = self._get_pacer_seq_no(cells[1])
if pacer_seq_no is not None:
de["pacer_seq_no"] = str(pacer_seq_no)
if not de["document_number"]:
if de["pacer_doc_id"]:
# If we lack the document number, but have
Expand Down Expand Up @@ -629,6 +743,20 @@ def _get_pacer_doc_id(cell):
doc1_url = urls[0].xpath("./@href")[0]
return get_pacer_doc_id_from_doc1_url(doc1_url)

@staticmethod
def _get_pacer_seq_no(
    cell: html.HtmlElement,
) -> Optional[_ElementUnicodeResult]:
    """Return the ``value`` attribute of the first input inside a cell.

    :param cell: Table cell as an lxml element
    :return: The attribute value, or None when the cell has no input
    element or that input has no value attribute
    """
    inputs = cell.xpath(".//input")
    if not inputs:
        return None
    values = inputs[0].xpath("./@value")
    # An input without a value attribute is reported as None, like a
    # missing input, rather than raising IndexError.
    return values[0] if values else None

def _get_case_name(self):
"""Get the case name."""

Expand Down
42 changes: 6 additions & 36 deletions juriscraper/pacer/attachment_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from .reports import BaseReport
from .utils import (
get_court_id_from_doc_id_prefix,
get_file_size_str_from_tr,
get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
reverse_goDLS_function,
)
Expand Down Expand Up @@ -93,15 +95,13 @@ def data(self):
file_size_bytes = self._get_file_size_bytes_from_tr(first_row)
if file_size_bytes is not None:
result["file_size_bytes"] = file_size_bytes
result["file_size_str"] = self._get_file_size_str_from_tr(
first_row
)
result["file_size_str"] = get_file_size_str_from_tr(first_row)
for row in rows:
attachment = {
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
"file_size_str": self._get_file_size_str_from_tr(row),
"file_size_str": get_file_size_str_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
# It may not be needed to reparse the seq_no
# for each row, but we may as well. So far, it
Expand Down Expand Up @@ -272,30 +272,12 @@ def _get_description_from_tr(self, row):
description = description_text_nodes[0].strip()
return force_unicode(description)

@staticmethod
def _get_input_value_from_tr(tr, idx):
    """Return one dash-separated field of the row's first input value.

    :param tr: Table row as an lxml element
    :param idx: Index of the field to return
    :return: The field at ``idx``, or None when the row has no input or
    the value does not split into exactly three fields
    """
    inputs = tr.xpath(".//input")
    if not inputs:
        return None
    # initial value string "23515655-90555-2"
    # "90555" is size in bytes "2" is pages
    fields = inputs[0].xpath("./@value")[0].split("-")
    return fields[idx] if len(fields) == 3 else None

@staticmethod
def _get_page_count_from_tr_input_value(tr):
    """Take a row from the attachment table and return the page count as an
    int extracted from the input value.

    :param tr: Table row as an lxml element
    :return: The page count, or None when the input value is unavailable
    """
    # The value is requested as index 2 of three dash-separated fields.
    count = get_input_value_from_tr(tr, 2, 3, "-")
    if count is None:
        return None
    return int(count)

Expand Down Expand Up @@ -327,26 +309,14 @@ def _get_file_size_bytes_from_tr(tr):
"""Take a row from the attachment table and return the number of bytes
as an int.
"""
file_size_str = AttachmentPage._get_input_value_from_tr(tr, 1)
file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
if file_size_str is None:
return None
file_size = int(file_size_str)
if file_size == 0:
return None
return file_size

@staticmethod
def _get_file_size_str_from_tr(tr):
    """Return the file size text shown in the last cell of a row.

    :param tr: Table row as an lxml element
    :return: The stripped text of the last ``td`` when it mentions "kb" or
    "mb" (case-insensitive), otherwise an empty string
    """
    last_cell_text = tr.xpath("./td")[-1].text_content()
    lowered = last_cell_text.lower()
    if "kb" in lowered or "mb" in lowered:
        return last_cell_text.strip()
    return ""

@staticmethod
def _get_pacer_doc_id(row):
"""Take in a row from the attachment table and return the pacer_doc_id
Expand Down
40 changes: 6 additions & 34 deletions juriscraper/pacer/docket_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from .docket_utils import normalize_party_types
from .reports import BaseReport
from .utils import (
get_file_size_str_from_tr,
get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
get_pacer_seq_no_from_doc1_anchor,
)
Expand Down Expand Up @@ -1177,30 +1179,12 @@ def _get_attachment_id_value_from_tr(tr, idx):
return None
return split_value[idx]

@staticmethod
def _get_input_value_from_tr(tr, idx):
    """Pick one field out of the first input value found in a row.

    :param tr: Table row as an lxml element
    :param idx: Index of the dash-separated field to return
    :return: The selected field, or None when the row has no input or the
    value does not consist of exactly three fields
    """
    matches = tr.xpath(".//input")
    if len(matches) == 0:
        return None
    first_input = matches[0]
    # initial value string "23515655-90555-2"
    # "90555" is size in bytes "2" is pages
    raw_value = first_input.xpath("./@value")[0]
    pieces = raw_value.split("-")
    if len(pieces) == 3:
        return pieces[idx]
    return None

@staticmethod
def _get_page_count_from_tr_input_value(tr):
    """Take a row from the attachment table and return the page count as an
    int extracted from the input value.

    :param tr: Table row as an lxml element
    :return: The page count, or None when the input value is unavailable
    """
    # The value is requested as index 2 of three dash-separated fields.
    count = get_input_value_from_tr(tr, 2, 3, "-")
    if count is None:
        return None
    return int(count)

Expand Down Expand Up @@ -1238,26 +1222,14 @@ def _get_file_size_bytes_from_tr(tr):
tr, 1
)
else:
file_size_str = DocketReport._get_input_value_from_tr(tr, 1)
file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
if file_size_str is None:
return None
file_size = int(file_size_str)
if file_size == 0:
return None
return file_size

@staticmethod
def _get_file_size_str_from_tr(tr):
    """Return the file size text from the last cell of an attachment row.

    :param tr: Table row as an lxml element
    :return: The stripped text of the last ``td`` if it contains "kb" or
    "mb" in any letter case, else an empty string
    """
    text = tr.xpath("./td")[-1].text_content()
    normalized = text.lower()
    for unit in ("kb", "mb"):
        if unit in normalized:
            return text.strip()
    return ""

def _get_pacer_doc_id(self, row):
"""Take in a row from the attachment table and return the pacer_doc_id
for the item in that row. Return None if the ID cannot be found.
Expand All @@ -1275,7 +1247,7 @@ def _get_pacer_doc_id(self, row):
if value:
pacer_doc_suffix = value[0]
else:
pacer_doc_suffix = DocketReport._get_input_value_from_tr(row, 0)
pacer_doc_suffix = get_input_value_from_tr(row, 0, 3, "-")
if pacer_doc_suffix is None:
return None
# after inserting prefixes our final doc_id is "035023515655"
Expand Down Expand Up @@ -1315,7 +1287,7 @@ def _get_attachments(self, cells):
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
"file_size_str": self._get_file_size_str_from_tr(row),
"file_size_str": get_file_size_str_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
# It may not be needed to reparse the seq_no
# for each row, but we may as well. So far, it
Expand Down
41 changes: 41 additions & 0 deletions juriscraper/pacer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,3 +853,44 @@ def parse_sumDocSelected_from_row(
if onclick and "sumDocSelected" in onclick[0]:
return reverse_sumDocSelected_function(onclick[0])
return None


def get_input_value_from_tr(
    tr: html.HtmlElement, idx: int, expected_values: int, split_value: str
) -> Optional[str]:
    """Take a row from the attachment table and return the input value by
    index.

    :param tr: An HTML row element from which the input value will be extracted.
    :param idx: The index of the value to extract from the split list.
    :param expected_values: The expected number of elements in the split value.
    :param split_value: The delimiter used to split the value string.
    :return: The extracted value at the specified index, or None when the
    row has no input, the input has no value attribute, or the value does
    not split into ``expected_values`` parts
    """
    inputs = tr.xpath(".//input")
    if not inputs:
        return None
    values = inputs[0].xpath("./@value")
    if not values:
        # The input has no value attribute, so there is nothing to split.
        return None
    # value="6828943 14732034 1 62576"
    # "62576" is size in bytes "1" is pages
    # or
    # value="23515655-90555-2"
    # "90555" is size in bytes "2" is pages
    # Keep the delimiter parameter intact; the pieces get their own name.
    parts = values[0].split(split_value)
    if len(parts) != expected_values:
        return None
    return parts[idx]


def get_file_size_str_from_tr(tr: html.HtmlElement) -> str:
    """Take a row from the attachment table and return the file size text
    shown in its last cell.

    :param tr: Table row as an lxml element
    :return: The stripped text of the last ``td`` when it mentions "kb" or
    "mb" (case-insensitive); an empty string otherwise, including when the
    row has no ``td`` cells
    """
    cells = tr.xpath("./td")
    if not cells:
        # A row without td cells has no size text to report.
        return ""
    last_cell_contents = cells[-1].text_content()
    lowered = last_cell_contents.lower()
    if any(unit in lowered for unit in ("kb", "mb")):
        return last_cell_contents.strip()
    return ""
Loading

0 comments on commit 66a4522

Please sign in to comment.