Skip to content

Commit

Permalink
Parse attachments from docket when available
Browse files Browse the repository at this point in the history
  • Loading branch information
ttys0dev committed Sep 7, 2023
1 parent df0e532 commit 94e2cd0
Show file tree
Hide file tree
Showing 6 changed files with 711 additions and 15 deletions.
143 changes: 143 additions & 0 deletions juriscraper/pacer/docket_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,142 @@ def _get_docket_entry_rows(self) -> List[HtmlElement]:
)
return docket_entry_all_rows

def _get_attachment_number(self, row):
    """Return the attachment number for the given attachment row.

    The number is parsed from the first text node found under any ``<td>``
    of the row. District courts use that value as is. Bankruptcy courts
    tally the main document too, so one is subtracted for them.
    """
    first_cell_text = row.xpath(".//td/text()")[0]
    tallied = int(first_cell_text.strip())
    return tallied - 1 if self.is_bankruptcy else tallied

def _get_description_from_tr(self, row):
    """Return the description text for an attachment row.

    The description is read from the third cell for bankruptcy courts and
    from the second cell otherwise. Some NEF attachment pages for some
    courts have an extra column (see nyed_123019137279); when a
    non-bankruptcy row has exactly five cells, the third cell is used.

    :param row: The ``<tr>`` element of the attachment.
    :return: The stripped first text node of the description cell passed
        through ``force_unicode``, or an empty string when the cell has no
        text nodes.
    """
    if self.is_bankruptcy or len(row.xpath("./td")) == 5:
        index = 3
    else:
        index = 2

    description_text_nodes = row.xpath(f"./td[{index}]//text()")
    if not description_text_nodes:
        # No text in the cell.
        return ""
    return force_unicode(description_text_nodes[0].strip())

@staticmethod
def _get_page_count_from_tr(tr):
    """Return the page count for a row of the attachment table.

    Every text node directly inside a ``<td>`` whose content mentions
    "page" is tried in order; the first one whose leading token parses as
    an integer wins.

    :param tr: The ``<tr>`` element of the attachment.
    :return: The page count as an int, or None when no candidate text node
        yields a number (for instance a restricted document without page
        count information).
    """
    pg_cnt_str_nodes = tr.xpath('./td[contains(., "page")]/text()')
    for pg_cnt_str_node in pg_cnt_str_nodes:
        try:
            return int(pg_cnt_str_node.split()[0])
        except (ValueError, IndexError):
            # ValueError: the description field contains the word "page"
            # and gets caught by the xpath. IndexError: the text node is
            # blank, so there is no first token. Just press on.
            continue
    return None

@staticmethod
def _get_file_size_str_from_tr(tr):
    """Return the file size text shown in the last cell of an attachment row.

    :param tr: The ``<tr>`` element of the attachment.
    :return: The stripped text of the row's last ``<td>`` when it mentions
        "kb" or "mb" (case-insensitive); otherwise an empty string. A row
        without any ``<td>`` also yields an empty string.
    """
    cells = tr.xpath("./td")
    if not cells:
        # Nothing to inspect, e.g. a row made only of header cells.
        return ""
    last_cell_contents = cells[-1].text_content()
    lowered = last_cell_contents.lower()
    if any(unit in lowered for unit in ("kb", "mb")):
        return last_cell_contents.strip()
    return ""

def _get_pacer_doc_id(self, row):
    """Return the pacer_doc_id for an attachment row, or None if not found.

    The ID is assembled from ``self.doc_id_prefix``, a literal "0", and the
    part of the first ``<input>`` element's ``value`` attribute that
    precedes the first "-".

    :param row: The ``<tr>`` element of the attachment.
    :return: The document ID string, or None when the row has no
        ``<input>`` or that input has no ``value`` attribute.
    """
    try:
        input_element = row.xpath(".//input")[0]
        value = input_element.xpath("./@value")[0]
    except IndexError:
        # Item exists, but cannot download document. Perhaps it's sealed
        # or otherwise unavailable in PACER.
        return None
    pacer_doc_suffix = value.split("-")[0]
    return self.doc_id_prefix + "0" + pacer_doc_suffix

@staticmethod
def _get_pacer_seq_no_from_tr(row):
    """Return the sequence number for a row of the attachment table.

    The value is the third underscore-separated field of the ``name``
    attribute of the row's first ``<input>`` element.

    :param row: The ``<tr>`` element of the attachment.
    :return: The sequence number as a string, or None when the row has no
        ``<input>``, the input has no ``name`` attribute, or the name has
        fewer than three underscore-separated fields.
    """
    inputs = row.xpath(".//input")
    if not inputs:
        # No input in the row. Maybe it's sealed.
        return None

    names = inputs[0].xpath("./@name")
    if not names:
        # The input carries no name attribute.
        return None

    fields = names[0].split("_")
    return fields[2] if len(fields) > 2 else None

def _get_attachments(self, cells):
    """Return one dict of parsed fields per ``<tr>`` in the tables under
    *cells*.

    Each dict holds the attachment number, description, page count, file
    size string, pacer_doc_id and pacer_seq_no produced by the matching
    helper methods.
    """
    return [
        {
            "attachment_number": self._get_attachment_number(tr),
            "description": self._get_description_from_tr(tr),
            "page_count": self._get_page_count_from_tr(tr),
            "file_size_str": self._get_file_size_str_from_tr(tr),
            "pacer_doc_id": self._get_pacer_doc_id(tr),
            # It may not be needed to reparse the seq_no
            # for each row, but we may as well. So far, it
            # has always been the same as the main document.
            "pacer_seq_no": self._get_pacer_seq_no_from_tr(tr),
        }
        for tr in cells.xpath("./table//tr")
    ]

@staticmethod
def _merge_de_with_attachment(de, attachment):
    """Copy the file size and page count of *attachment* onto *de*.

    Both dicts must agree on ``pacer_doc_id`` and ``pacer_seq_no``;
    otherwise a ValueError naming the mismatching values is raised and
    *de* is left untouched.
    """
    de_doc_id = de["pacer_doc_id"]
    att_doc_id = attachment["pacer_doc_id"]
    if de_doc_id != att_doc_id:
        raise ValueError(
            f"docket entry doc_id {de_doc_id} does not match "
            f"attachment 0 doc_id {att_doc_id}"
        )

    de_seq_no = de["pacer_seq_no"]
    att_seq_no = attachment["pacer_seq_no"]
    if de_seq_no != att_seq_no:
        raise ValueError(
            f"docket entry seq_no {de_seq_no} does not match "
            f"attachment 0 seq_no {att_seq_no}"
        )

    for key in ("file_size_str", "page_count"):
        de[key] = attachment[key]

@property
def docket_entries(self):
if self._docket_entries is not None:
Expand Down Expand Up @@ -1037,6 +1173,13 @@ def docket_entries(self):

date_filed_str = force_unicode(cells[0].text_content())
if not date_filed_str.strip():
if view_multiple_documents and len(cells) >= 3:
last_de = docket_entries[-1]
attachments = self._get_attachments(cells[2])
if attachments[0]["attachment_number"] == 0:
de_attachment = attachments.pop(0)
self._merge_de_with_attachment(last_de, de_attachment)
last_de["attachments"] = attachments
# Some older dockets have missing dates. Press on.
continue
de["date_filed"] = convert_date_string(date_filed_str)
Expand Down
11 changes: 10 additions & 1 deletion juriscraper/pacer/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
strip_bad_html_tags_insecure,
)
from ..lib.log_tools import make_default_logger
from .utils import is_pdf, make_doc1_url, make_docs1_url
from .utils import (
get_doc_id_prefix_from_court_id,
is_pdf,
make_doc1_url,
make_docs1_url,
)

logger = make_default_logger()

Expand Down Expand Up @@ -54,6 +59,10 @@ def __init__(self, court_id, pacer_session=None):
self.response = None
self.is_valid = None

@property
def doc_id_prefix(self):
    """Return the result of ``get_doc_id_prefix_from_court_id`` for this
    report's ``court_id``.
    """
    prefix = get_doc_id_prefix_from_court_id(self.court_id)
    return prefix

@property
def url(self):
if self.court_id == "psc":
Expand Down
Loading

0 comments on commit 94e2cd0

Please sign in to comment.