Parse attachments from docket when available

freelawproject · Sep 6, 2023 · cdf6b87 · cdf6b87
1 parent d2c6672
commit cdf6b87
Show file tree

Hide file tree

Showing 2 changed files with 233 additions and 0 deletions.
diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py
@@ -992,6 +992,134 @@ def _get_docket_entry_rows(self) -> List[HtmlElement]:
         )
         return docket_entry_all_rows
 
+    def _get_attachment_number(self, row):
+        """Parse the attachment number out of an attachment-table row.
+
+        The number is read from the first text node found in the row's cells.
+        Bankruptcy courts count the main document in their tally, so their
+        value is shifted down by one; otherwise the value is used as-is.
+        """
+        first_cell_text = row.xpath(".//td/text()")[0]
+        raw_number = int(first_cell_text.strip())
+        offset = 1 if self.is_bankruptcy else 0
+        return raw_number - offset
+
+    def _get_description_from_tr(self, row):
+        """Return the description text held in an attachment-table row.
+
+        The description sits in the third cell for bankruptcy courts and in
+        the second cell otherwise, except that some NEF attachment pages for
+        some courts carry an extra column (see nyed_123019137279); those
+        five-column rows keep it in the third cell as well.
+
+        Only the first text node of the cell is used; an empty string is
+        returned when the cell holds no text at all.
+        """
+        index = 3 if self.is_bankruptcy else 2
+        if not self.is_bankruptcy and len(row.xpath("./td")) == 5:
+            index = 3
+        description_text_nodes = row.xpath(f"./td[{index}]//text()")
+        if not description_text_nodes:
+            return ""
+        return force_unicode(description_text_nodes[0].strip())
+
+    @staticmethod
+    def _get_page_count_from_tr(tr):
+        """Take a row from the attachment table and return its page count.
+
+        The cells whose text contains the word "page" are searched, and the
+        first text node that starts with an integer wins. None is returned
+        when no node does (e.g. a restricted document without a page count).
+        """
+        pg_cnt_str_nodes = tr.xpath('./td[contains(., "page")]/text()')
+        for pg_cnt_str_node in pg_cnt_str_nodes:
+            try:
+                return int(pg_cnt_str_node.split()[0])
+            except (ValueError, IndexError):
+                # ValueError: the description field contains the word
+                # "page" and got caught by the xpath. IndexError: the text
+                # node is empty or whitespace-only, so there is no first
+                # token to parse. Either way, just press on.
+                continue
+        return None
+
+    @staticmethod
+    def _get_file_size_str_from_tr(tr):
+        """Take a row from the attachment table and return its file size as
+        a human-readable string (e.g. "37.6 KB"), or "" when the last cell
+        mentions no KB/MB unit or the row has no cells at all.
+        """
+        cells = tr.xpath("./td")
+        last_cell_contents = cells[-1].text_content() if cells else ""
+        if any(unit in last_cell_contents.lower() for unit in ("kb", "mb")):
+            return last_cell_contents.strip()
+        return ""
+
+    @staticmethod
+    def _get_pacer_doc_id(row, godls_row):
+        """Take in a row from the attachment table and return the pacer_doc_id
+        for the item in that row. Return None if the ID cannot be found.
+
+        The first "-"-separated piece of the value of the row's first <input>
+        is matched against the end of each link href found in godls_row; the
+        first href that matches is converted to the ID.
+        """
+        inputs = row.xpath(".//input")
+        values = inputs[0].xpath("./@value") if inputs else []
+        if not values or godls_row is None:
+            # Item exists, but cannot download document. Perhaps it's sealed
+            # or otherwise unavailable in PACER. The same answer is given
+            # when there is no row of links to search for the ID.
+            return None
+
+        pacer_doc_suffix = values[0].split("-")[0]
+        # Anchors lacking an href are skipped rather than raising.
+        for href in godls_row.xpath(".//a/@href"):
+            if href.endswith(pacer_doc_suffix):
+                return get_pacer_doc_id_from_doc1_url(href)
+        return None
+
+    @staticmethod
+    def _get_pacer_seq_no_from_tr(row):
+        """Take a row of the attachment page, and return the sequence number
+        stored in the name attribute of the row's first <input>.
+
+        The name is split on "_" and its third piece is returned as a string.
+        None is returned when the row has no <input> (maybe it's sealed), when
+        the <input> has no name, or when the name has fewer than three pieces.
+        """
+        inputs = row.xpath(".//input")
+        if not inputs:
+            # No input in the row. Maybe it's sealed.
+            return None
+
+        names = inputs[0].xpath("./@name")
+        name_parts = names[0].split("_") if names else []
+        if len(name_parts) < 3:
+            # No name on this input, or not in the expected format.
+            return None
+        return name_parts[2]
+
+    def _get_attachments(self, cells, godls_row):
+        """Build one dict per row of the table nested inside ``cells``.
+
+        Each dict is filled in by the per-row helper methods of this class;
+        ``godls_row`` is handed to ``_get_pacer_doc_id`` for every row.
+        """
+        # The seq_no is reparsed for each row even though, so far, it has
+        # always been the same as the main document's.
+        return [
+            {
+                "attachment_number": self._get_attachment_number(tr),
+                "description": self._get_description_from_tr(tr),
+                "page_count": self._get_page_count_from_tr(tr),
+                "file_size_str": self._get_file_size_str_from_tr(tr),
+                "pacer_doc_id": self._get_pacer_doc_id(tr, godls_row),
+                "pacer_seq_no": self._get_pacer_seq_no_from_tr(tr),
+            }
+            for tr in cells.xpath("./table//tr")
+        ]
+
     @property
     def docket_entries(self):
         if self._docket_entries is not None:
@@ -1007,6 +1135,7 @@ def docket_entries(self):
         if view_selected_btn:
             view_multiple_documents = True
         docket_entries = []
+        godls_row = None
         for row in docket_entry_rows:
             de = {}
             cells = row.xpath("./td[not(./input)]")
@@ -1037,8 +1166,12 @@ def docket_entries(self):
 
             date_filed_str = force_unicode(cells[0].text_content())
             if not date_filed_str.strip():
+                if view_multiple_documents and len(cells) >= 3:
+                    docket_entries[-1]["attachments"] = self._get_attachments(cells[2], godls_row)
                 # Some older dockets have missing dates. Press on.
                 continue
+            elif view_multiple_documents and len(cells) >= 3:
+                godls_row = cells[2]
             de["date_filed"] = convert_date_string(date_filed_str)
             de["document_number"] = self._get_document_number(cells[1])
             results = self._get_pacer_doc_id_and_seq_no(

diff --git a/tests/examples/pacer/dockets/district/dcd_3.json b/tests/examples/pacer/dockets/district/dcd_3.json
@@ -10,6 +10,32 @@
   "demand": "",
   "docket_entries": [
     {
+      "attachments": [
+        {
+          "attachment_number": 0,
+          "description": "Main Document",
+          "file_size_str": "1.0 MB",
+          "pacer_doc_id": null,
+          "pacer_seq_no": "15",
+          "page_count": 64
+        },
+        {
+          "attachment_number": 1,
+          "description": "Civil Cover Sheet",
+          "file_size_str": "37.6 KB",
+          "pacer_doc_id": "04508117527",
+          "pacer_seq_no": "15",
+          "page_count": 2
+        },
+        {
+          "attachment_number": 2,
+          "description": "Summons",
+          "file_size_str": "73.8 KB",
+          "pacer_doc_id": "04508117528",
+          "pacer_seq_no": "15",
+          "page_count": 2
+        }
+      ],
       "date_entered": "2020-10-20",
       "date_filed": "2020-10-20",
       "description": "COMPLAINT against GOOGLE LLC filed by UNITED STATES OF AMERICA. (Attachments: # 1 Civil Cover Sheet, # 2 Summons)(ztnr) (Entered: 10/20/2020)",
@@ -58,6 +84,80 @@
       "pacer_seq_no": "2039"
     },
     {
+      "attachments": [
+        {
+          "attachment_number": 0,
+          "description": "Main Document",
+          "file_size_str": "310.6 KB",
+          "pacer_doc_id": null,
+          "pacer_seq_no": "2042",
+          "page_count": 16
+        },
+        {
+          "attachment_number": 1,
+          "description": "Memorandum in Support A",
+          "file_size_str": "621.7 KB",
+          "pacer_doc_id": "04509920630",
+          "pacer_seq_no": "2042",
+          "page_count": 21
+        },
+        {
+          "attachment_number": 2,
+          "description": "Exhibit B",
+          "file_size_str": "80.6 KB",
+          "pacer_doc_id": "04509920631",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 3,
+          "description": "Exhibit C",
+          "file_size_str": "80.0 KB",
+          "pacer_doc_id": "04509920632",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 4,
+          "description": "Exhibit D",
+          "file_size_str": "80.0 KB",
+          "pacer_doc_id": "04509920633",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 5,
+          "description": "Exhibit E",
+          "file_size_str": "79.7 KB",
+          "pacer_doc_id": "04509920634",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 6,
+          "description": "Exhibit F",
+          "file_size_str": "80.0 KB",
+          "pacer_doc_id": "04509920635",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 7,
+          "description": "Exhibit G",
+          "file_size_str": "52.9 KB",
+          "pacer_doc_id": "04509920636",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        },
+        {
+          "attachment_number": 8,
+          "description": "Certificate of Service",
+          "file_size_str": "312.2 KB",
+          "pacer_doc_id": "04509920637",
+          "pacer_seq_no": "2042",
+          "page_count": 1
+        }
+      ],
       "date_entered": "2023-05-09",
       "date_filed": "2023-05-09",
       "description": "REDACTED DOCUMENT- Plaintiff States' Motion for Leave to File a Supplemental Response to Certain Questions of the Court at Oral Argument to 584 Sealed Document, by STATE OF COLORADO. (Attachments: # 1 Memorandum in Support A, # 2 Exhibit B, # 3 Exhibit C, # 4 Exhibit D, # 5 Exhibit E, # 6 Exhibit F, # 7 Exhibit G, # 8 Certificate of Service)(Sallet, Jonathan) (Entered: 05/09/2023)",