diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py
index 6b8f2d348..ee90a6c34 100644
--- a/juriscraper/pacer/docket_report.py
+++ b/juriscraper/pacer/docket_report.py
@@ -992,6 +992,170 @@ def _get_docket_entry_rows(self) -> List[HtmlElement]:
             )
         return docket_entry_all_rows
 
+    def _get_attachment_number(self, row):
+        """Return the attachment number for an item.
+
+        In district courts, this can be easily extracted. In bankruptcy courts,
+        you must extract it, then subtract 1 from the value since these are
+        tallied and include the main document.
+
+        :param row: A tr element from the attachment table.
+        :return: The attachment number as an int.
+        :raises IndexError: If no td in the row has a text node.
+        :raises ValueError: If the first such text node is not an integer.
+        """
+        number = int(row.xpath(".//td/text()")[0].strip())
+        if self.is_bankruptcy:
+            return number - 1
+        return number
+
+    def _get_description_from_tr(self, row):
+        """Get the description from the row.
+
+        :param row: A tr element from the attachment table.
+        :return: The stripped first text node of the description cell, or an
+        empty string if that cell has no text.
+        """
+        # Bankruptcy courts keep the description in the third column; other
+        # courts normally keep it in the second.
+        # Some NEFs attachment pages for some courts have an extra column
+        # (see nyed_123019137279), use index 3 to get the description
+        if self.is_bankruptcy or len(row.xpath("./td")) == 5:
+            index = 3
+        else:
+            index = 2
+
+        description_text_nodes = row.xpath(f"./td[{index}]//text()")
+        if not description_text_nodes:
+            # No text in the cell.
+            return ""
+        description = description_text_nodes[0].strip()
+        return force_unicode(description)
+
+    @staticmethod
+    def _get_page_count_from_tr(tr):
+        """Take a row from the attachment table and return the page count.
+
+        :param tr: A tr element from the attachment table.
+        :return: The page count as an int, or None if no text node of a cell
+        mentioning "page" starts with an integer.
+        """
+        pg_cnt_str_nodes = tr.xpath('./td[contains(., "page")]/text()')
+        if not pg_cnt_str_nodes:
+            # It's a restricted document without page count information.
+            return None
+
+        for pg_cnt_str_node in pg_cnt_str_nodes:
+            try:
+                pg_cnt_str = pg_cnt_str_node.strip()
+                return int(pg_cnt_str.split()[0])
+            except (ValueError, IndexError):
+                # ValueError happens when the description field contains the
+                # word "page" and gets caught by the xpath. IndexError happens
+                # when the text node is only whitespace. Just press on.
+                continue
+        return None
+
+    @staticmethod
+    def _get_file_size_str_from_tr(tr):
+        """Take a row from the attachment table and return its file size.
+
+        :param tr: A tr element from the attachment table.
+        :return: The stripped text of the last cell (e.g. "37.6 KB") if it
+        mentions KB or MB, otherwise an empty string.
+        """
+        cells = tr.xpath("./td")
+        if not cells:
+            # A row without td cells has no size column.
+            return ""
+        last_cell_contents = cells[-1].text_content()
+        units = ["kb", "mb"]
+        if any(unit in last_cell_contents.lower() for unit in units):
+            return last_cell_contents.strip()
+        return ""
+
+    @staticmethod
+    def _get_pacer_doc_id(row, godls_row):
+        """Take in a row from the attachment table and return the pacer_doc_id
+        for the item in that row. Return None if the ID cannot be found.
+
+        :param row: A tr element from the attachment table.
+        :param godls_row: The element whose links are searched for an href
+        ending with the part of the row's input value before its first dash.
+        May be None.
+        :return: The ID derived from the first matching href, or None.
+        """
+        inputs = row.xpath(".//input")
+        if not inputs:
+            # Item exists, but cannot download document. Perhaps it's sealed
+            # or otherwise unavailable in PACER. This is carried over from the
+            # docket report and may not be needed here, but it's a good
+            # precaution.
+            return None
+
+        values = inputs[0].xpath("./@value")
+        pacer_doc_suffix = values[0].split("-")[0] if values else ""
+        if not pacer_doc_suffix or godls_row is None:
+            # Nothing to match against. An empty suffix is rejected because
+            # every href ends with the empty string.
+            return None
+
+        # Anchors without an href are skipped by selecting the attribute.
+        for href in godls_row.xpath(".//a/@href"):
+            if href.endswith(pacer_doc_suffix):
+                return get_pacer_doc_id_from_doc1_url(href)
+        return None
+
+    @staticmethod
+    def _get_pacer_seq_no_from_tr(row):
+        """Take a row of the attachment page, and return the sequence number.
+
+        The sequence number is the third underscore-separated field of the
+        name attribute of the first input in the row.
+
+        :param row: A tr element from the attachment table.
+        :return: The sequence number as a str, or None if the row has no
+        input, the input has no name, or the name has fewer than three fields.
+        """
+        inputs = row.xpath(".//input")
+        if not inputs:
+            # No input in the row. Maybe it's sealed.
+            return None
+
+        names = inputs[0].xpath("./@name")
+        if not names:
+            # No name attribute on the input.
+            return None
+
+        name_parts = names[0].split("_")
+        if len(name_parts) < 3:
+            # The name is not in the expected format.
+            return None
+        return name_parts[2]
+
+    def _get_attachments(self, cells, godls_row):
+        """Parse the attachment table nested in a docket entry cell.
+
+        :param cells: The element that contains the nested attachment table.
+        :param godls_row: The element whose links are used to look up the
+        pacer_doc_id of each attachment.
+        :return: A list with one dict per row of the nested table.
+        """
+        return [
+            {
+                "attachment_number": self._get_attachment_number(row),
+                "description": self._get_description_from_tr(row),
+                "page_count": self._get_page_count_from_tr(row),
+                "file_size_str": self._get_file_size_str_from_tr(row),
+                "pacer_doc_id": self._get_pacer_doc_id(row, godls_row),
+                # It may not be needed to reparse the seq_no
+                # for each row, but we may as well. So far, it
+                # has always been the same as the main document.
+                "pacer_seq_no": self._get_pacer_seq_no_from_tr(row),
+            }
+            for row in cells.xpath("./table//tr")
+        ]
+
     @property
     def docket_entries(self):
         if self._docket_entries is not None:
@@ -1007,6 +1171,7 @@ def docket_entries(self):
         if view_selected_btn:
             view_multiple_documents = True
         docket_entries = []
+        godls_row = None
         for row in docket_entry_rows:
             de = {}
             cells = row.xpath("./td[not(./input)]")
@@ -1037,8 +1202,20 @@ def docket_entries(self):
 
             date_filed_str = force_unicode(cells[0].text_content())
             if not date_filed_str.strip():
+                # An undated row is parsed as the attachment table of the
+                # previous entry, so only do that when a previous entry exists.
+                if (
+                    view_multiple_documents
+                    and len(cells) >= 3
+                    and docket_entries
+                ):
+                    docket_entries[-1]["attachments"] = self._get_attachments(
+                        cells[2], godls_row
+                    )
                 # Some older dockets have missing dates. Press on.
                 continue
+            elif view_multiple_documents and len(cells) >= 3:
+                godls_row = cells[2]
             de["date_filed"] = convert_date_string(date_filed_str)
             de["document_number"] = self._get_document_number(cells[1])
             results = self._get_pacer_doc_id_and_seq_no(
diff --git a/tests/examples/pacer/dockets/district/dcd_3.json b/tests/examples/pacer/dockets/district/dcd_3.json
index 7b41e4789..cb15bf478 100644
--- a/tests/examples/pacer/dockets/district/dcd_3.json
+++ b/tests/examples/pacer/dockets/district/dcd_3.json
@@ -10,6 +10,32 @@
   "demand": "",
   "docket_entries": [
     {
+      "attachments": [
+        {
+          "attachment_number": 0,
+          "description": "Main Document",
+          "file_size_str": "1.0 MB",
+          "pacer_doc_id": null,
+          "pacer_seq_no": "15",
+          "page_count": 64
+        },
+        {
+          "attachment_number": 1,
+          "description": "Civil Cover Sheet",
+          "file_size_str": "37.6 KB",
+          "pacer_doc_id": "04508117527",
+          "pacer_seq_no": "15",
+          "page_count": 2
+        },
+        {
+          "attachment_number": 2,
+          "description": "Summons",
+          "file_size_str": "73.8 KB",
+          "pacer_doc_id": "04508117528",
+          "pacer_seq_no": "15",
+          "page_count": 2
+        }
+      ],
       "date_entered": "2020-10-20",
       "date_filed": "2020-10-20",
       "description": "COMPLAINT against GOOGLE LLC filed by UNITED STATES OF AMERICA. (Attachments: # 1 Civil Cover Sheet, # 2 Summons)(ztnr) (Entered: 10/20/2020)",
@@ -58,6 +84,80 @@
       "pacer_seq_no": "2039"
     },
     {
+      "attachments": [
+        {
+          "attachment_number": 0,
+          "description": "Main Document",
+          "file_size_str": "310.6 KB",
+          "pacer_doc_id": null,
+          "pacer_seq_no": "2042",
+          "page_count": 16
+        },
+        {
+          "attachment_number": 1,
+          "description": "Memorandum in Support A",
+          "file_size_str": "621.7 KB",
+          "pacer_doc_id": "04509920630",
+          "pacer_seq_no": "2042",
+          "page_count": 21
+        },
+        {
+          "attachment_number": 2,
+          "description": "Exhibit B",
+          "file_size_str": "80.6 KB",
+          "pacer_doc_id": "04509920631",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 3,
+          "description": "Exhibit C",
+          "file_size_str": "80.0 KB",
+          "pacer_doc_id": "04509920632",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 4,
+          "description": "Exhibit D",
+          "file_size_str": "80.0 KB",
+          "pacer_doc_id": "04509920633",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 5,
+          "description": "Exhibit E",
+          "file_size_str": "79.7 KB",
+          "pacer_doc_id": "04509920634",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 6,
+          "description": "Exhibit F",
+          "file_size_str": "80.0 KB",
+          "pacer_doc_id": "04509920635",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 7,
+          "description": "Exhibit G",
+          "file_size_str": "52.9 KB",
+          "pacer_doc_id": "04509920636",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 8,
+          "description": "Certificate of Service",
+          "file_size_str": "312.2 KB",
+          "pacer_doc_id": "04509920637",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        }
+      ],
       "date_entered": "2023-05-09",
       "date_filed": "2023-05-09",
       "description": "REDACTED DOCUMENT- Plaintiff States' Motion for Leave to File a Supplemental Response to Certain Questions of the Court at Oral Argument to 584 Sealed Document, by STATE OF COLORADO. (Attachments: # 1 Memorandum in Support A, # 2 Exhibit B, # 3 Exhibit C, # 4 Exhibit D, # 5 Exhibit E, # 6 Exhibit F, # 7 Exhibit G, # 8 Certificate of Service)(Sallet, Jonathan) (Entered: 05/09/2023)",