Skip to content

Commit

Permalink
Parse attachments from docket when available
Browse files Browse the repository at this point in the history
  • Loading branch information
ttys0dev committed Sep 6, 2023
1 parent d2c6672 commit cdf6b87
Show file tree
Hide file tree
Showing 2 changed files with 233 additions and 0 deletions.
133 changes: 133 additions & 0 deletions juriscraper/pacer/docket_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,134 @@ def _get_docket_entry_rows(self) -> List[HtmlElement]:
)
return docket_entry_all_rows

def _get_attachment_number(self, row):
    """Extract the attachment number shown in a row of the attachment table.

    The number is parsed from the first text node found in the row's cells.
    District courts show the attachment number directly. Bankruptcy courts
    tally the main document as well, so one is subtracted from the value.

    :param row: A row element of the attachment table.
    :return: The attachment number as an int.
    """
    first_cell_text = row.xpath(".//td/text()")[0]
    shown_number = int(first_cell_text.strip())
    # Bankruptcy tallies are off by one because they include the main
    # document.
    offset = 1 if self.is_bankruptcy else 0
    return shown_number - offset

def _get_description_from_tr(self, row):
    """Return the description text of a row in the attachment table.

    Bankruptcy courts keep the description in the third cell. Other courts
    keep it in the second cell, unless the row has five cells, in which
    case the third cell is used.

    :param row: A row element of the attachment table.
    :return: The stripped first text node of the description cell, passed
    through force_unicode, or an empty string if the cell has no text.
    """
    if self.is_bankruptcy:
        cell_position = 3
    else:
        # Some NEFs attachment pages for some courts have an extra column
        # (see nyed_123019137279); the description then sits in cell 3.
        cell_count = len(row.xpath("./td"))
        cell_position = 3 if cell_count == 5 else 2

    text_nodes = row.xpath(f"./td[{cell_position}]//text()")
    if text_nodes:
        return force_unicode(text_nodes[0].strip())
    # No text in the cell.
    return ""

@staticmethod
def _get_page_count_from_tr(tr):
    """Return the page count shown in a row of the attachment table.

    The text nodes of every cell whose content contains the word "page" are
    inspected in order; the first one that starts with an integer (e.g.
    "3 pages") provides the result.

    :param tr: A row element of the attachment table.
    :return: The page count as an int, or None if the row has no usable
    page count.
    """
    candidates = tr.xpath('./td[contains(., "page")]/text()')
    # An empty result means it's a restricted document without page count
    # information; the loop below then does not run at all.
    for candidate in candidates:
        try:
            # split() already discards surrounding whitespace.
            return int(candidate.split()[0])
        except (ValueError, IndexError):
            # ValueError: the description field contains the word "page"
            # and got caught by the xpath. IndexError: the text node is
            # empty or whitespace only. Either way, just press on.
            continue
    return None

@staticmethod
def _get_file_size_str_from_tr(tr):
    """Return the file size text shown in a row of the attachment table.

    The size is read from the last cell of the row and is only accepted if
    it mentions a known unit ("kb" or "mb", case-insensitive).

    :param tr: A row element of the attachment table.
    :return: The stripped text of the last cell (e.g. "37.6 KB"), or an
    empty string if the row has no cells or the last cell holds no size.
    """
    cells = tr.xpath("./td")
    if not cells:
        # A row without cells cannot carry a size.
        return ""
    last_cell_contents = cells[-1].text_content()
    units = ("kb", "mb")
    lowered = last_cell_contents.lower()
    if any(unit in lowered for unit in units):
        return last_cell_contents.strip()
    return ""

@staticmethod
def _get_pacer_doc_id(row, godls_row):
    """Look up the pacer_doc_id for a row of the attachment table.

    The part of the row's first input value that precedes the first "-" is
    used as a suffix. The first link in godls_row whose href ends with that
    suffix is converted with get_pacer_doc_id_from_doc1_url.

    :param row: A row element of the attachment table.
    :param godls_row: The element whose links are searched.
    :return: The pacer_doc_id, or None if the row has no input or no link
    matches.
    """
    inputs = row.xpath(".//input")
    if not inputs:
        # Item exists, but cannot download document. Perhaps it's sealed
        # or otherwise unavailable in PACER. This is carried over from the
        # docket report and may not be needed here, but it's a good
        # precaution.
        return None

    value = inputs[0].xpath("./@value")[0]
    suffix = value.split("-")[0]
    for anchor in godls_row.xpath(".//a"):
        href = anchor.xpath("./@href")[0]
        if href.endswith(suffix):
            return get_pacer_doc_id_from_doc1_url(href)
    return None

@staticmethod
def _get_pacer_seq_no_from_tr(row):
    """Return the sequence number encoded in a row of the attachment page.

    The value is the third underscore-separated segment of the ``name``
    attribute of the row's first input element.

    :param row: A row element of the attachment table.
    :return: The sequence number as a str, or None if the row has no
    input, the input has no name, or the name has fewer than three
    segments.
    """
    inputs = row.xpath(".//input")
    if not inputs:
        # No input in the row. Maybe it's sealed.
        return None

    names = inputs[0].xpath("./@name")
    if not names:
        # The input carries no name attribute.
        return None

    segments = names[0].split("_")
    if len(segments) < 3:
        # Unexpected name format; treat it like any other missing value
        # instead of raising IndexError.
        return None
    return segments[2]

def _get_attachments(self, cells, godls_row):
    """Build the list of attachments found beneath a docket entry element.

    :param cells: The element holding the attachment table; every row of
    a table directly beneath it yields one attachment.
    :param godls_row: The element handed to _get_pacer_doc_id for each
    attachment row.
    :return: A list with one dict per row, holding the attachment number,
    description, page count, file size string, pacer_doc_id and
    pacer_seq_no.
    """
    return [
        {
            "attachment_number": self._get_attachment_number(tr),
            "description": self._get_description_from_tr(tr),
            "page_count": self._get_page_count_from_tr(tr),
            "file_size_str": self._get_file_size_str_from_tr(tr),
            "pacer_doc_id": self._get_pacer_doc_id(tr, godls_row),
            # It may not be needed to reparse the seq_no
            # for each row, but we may as well. So far, it
            # has always been the same as the main document.
            "pacer_seq_no": self._get_pacer_seq_no_from_tr(tr),
        }
        for tr in cells.xpath("./table//tr")
    ]

@property
def docket_entries(self):
if self._docket_entries is not None:
Expand All @@ -1007,6 +1135,7 @@ def docket_entries(self):
if view_selected_btn:
view_multiple_documents = True
docket_entries = []
godls_row = None
for row in docket_entry_rows:
de = {}
cells = row.xpath("./td[not(./input)]")
Expand Down Expand Up @@ -1037,8 +1166,12 @@ def docket_entries(self):

date_filed_str = force_unicode(cells[0].text_content())
if not date_filed_str.strip():
if view_multiple_documents and len(cells) >= 3:
docket_entries[-1]["attachments"] = self._get_attachments(cells[2], godls_row)
# Some older dockets have missing dates. Press on.
continue
elif view_multiple_documents and len(cells) >= 3:
godls_row = cells[2]
de["date_filed"] = convert_date_string(date_filed_str)
de["document_number"] = self._get_document_number(cells[1])
results = self._get_pacer_doc_id_and_seq_no(
Expand Down
100 changes: 100 additions & 0 deletions tests/examples/pacer/dockets/district/dcd_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,32 @@
"demand": "",
"docket_entries": [
{
"attachments": [
{
"attachment_number": 0,
"description": "Main Document",
"file_size_str": "1.0 MB",
"pacer_doc_id": null,
"pacer_seq_no": "15",
"page_count": 64
},
{
"attachment_number": 1,
"description": "Civil Cover Sheet",
"file_size_str": "37.6 KB",
"pacer_doc_id": "04508117527",
"pacer_seq_no": "15",
"page_count": 2
},
{
"attachment_number": 2,
"description": "Summons",
"file_size_str": "73.8 KB",
"pacer_doc_id": "04508117528",
"pacer_seq_no": "15",
"page_count": 2
}
],
"date_entered": "2020-10-20",
"date_filed": "2020-10-20",
"description": "COMPLAINT against GOOGLE LLC filed by UNITED STATES OF AMERICA. (Attachments: # 1 Civil Cover Sheet, # 2 Summons)(ztnr) (Entered: 10/20/2020)",
Expand Down Expand Up @@ -58,6 +84,80 @@
"pacer_seq_no": "2039"
},
{
"attachments": [
{
"attachment_number": 0,
"description": "Main Document",
"file_size_str": "310.6 KB",
"pacer_doc_id": null,
"pacer_seq_no": "2042",
"page_count": 16
},
{
"attachment_number": 1,
"description": "Memorandum in Support A",
"file_size_str": "621.7 KB",
"pacer_doc_id": "04509920630",
"pacer_seq_no": "2042",
"page_count": 21
},
{
"attachment_number": 2,
"description": "Exhibit B",
"file_size_str": "80.6 KB",
"pacer_doc_id": "04509920631",
"pacer_seq_no": "2042",
"page_count": 1
},
{
"attachment_number": 3,
"description": "Exhibit C",
"file_size_str": "80.0 KB",
"pacer_doc_id": "04509920632",
"pacer_seq_no": "2042",
"page_count": 1
},
{
"attachment_number": 4,
"description": "Exhibit D",
"file_size_str": "80.0 KB",
"pacer_doc_id": "04509920633",
"pacer_seq_no": "2042",
"page_count": 1
},
{
"attachment_number": 5,
"description": "Exhibit E",
"file_size_str": "79.7 KB",
"pacer_doc_id": "04509920634",
"pacer_seq_no": "2042",
"page_count": 1
},
{
"attachment_number": 6,
"description": "Exhibit F",
"file_size_str": "80.0 KB",
"pacer_doc_id": "04509920635",
"pacer_seq_no": "2042",
"page_count": 1
},
{
"attachment_number": 7,
"description": "Exhibit G",
"file_size_str": "52.9 KB",
"pacer_doc_id": "04509920636",
"pacer_seq_no": "2042",
"page_count": 1
},
{
"attachment_number": 8,
"description": "Certificate of Service",
"file_size_str": "312.2 KB",
"pacer_doc_id": "04509920637",
"pacer_seq_no": "2042",
"page_count": 1
}
],
"date_entered": "2023-05-09",
"date_filed": "2023-05-09",
"description": "REDACTED DOCUMENT- Plaintiff States' Motion for Leave to File a Supplemental Response to Certain Questions of the Court at Oral Argument to 584 Sealed Document, by STATE OF COLORADO. (Attachments: # 1 Memorandum in Support A, # 2 Exhibit B, # 3 Exhibit C, # 4 Exhibit D, # 5 Exhibit E, # 6 Exhibit F, # 7 Exhibit G, # 8 Certificate of Service)(Sallet, Jonathan) (Entered: 05/09/2023)",
Expand Down

0 comments on commit cdf6b87

Please sign in to comment.