Merge pull request #753 from ttys0dev/appelate-attachments

freelawproject · Dec 31, 2024 · 66a4522 · 66a4522
2 parents dde9d68 + a1d3db4
commit 66a4522
Show file tree

Hide file tree

Showing 13 changed files with 2,847 additions and 149 deletions.
diff --git a/juriscraper/pacer/appellate_docket.py b/juriscraper/pacer/appellate_docket.py
@@ -2,8 +2,10 @@
 import re
 import sys
 from collections import OrderedDict
+from typing import Any, Dict, List, Optional
 
-from lxml.html import tostring
+from lxml import html
+from lxml.etree import _ElementUnicodeResult
 
 from ..lib.judge_parsers import normalize_judge_string
 from ..lib.log_tools import make_default_logger
@@ -18,6 +20,8 @@
 from .reports import BaseReport
 from .utils import (
     get_court_id_from_url,
+    get_file_size_str_from_tr,
+    get_input_value_from_tr,
     get_pacer_doc_id_from_doc1_url,
     is_pdf,
 )
@@ -562,6 +566,97 @@ def parties(self):
         self._parties = parties
         return parties
 
+    def _get_attachment_number(self, row: html.HtmlElement) -> int:
+        """Return the attachment number for an item.
+
+        :param row: Table row as an lxml element
+        :return: Attachment number for row
+        """
+        return int(row.xpath(".//td/text()")[0].strip())
+
+    def _get_description_from_tr(self, row: html.HtmlElement) -> str:
+        """Get the description from the row
+
+        :param row: Table row
+        :return: Attachment description
+        """
+        description_text_nodes = row.xpath(f"./td[4]//text()")
+        if not description_text_nodes:
+            # No text in the cell.
+            return ""
+        description = description_text_nodes[0].strip()
+        return force_unicode(description)
+
+    @staticmethod
+    def _get_page_count_from_tr(tr: html.HtmlElement) -> Optional[int]:
+        """Take a row from the attachment table and return the page count as an
+        int extracted from the input value.
+        """
+        count = get_input_value_from_tr(tr, 2, 4, " ")
+        if count is not None:
+            return int(count)
+
+    @staticmethod
+    def _get_file_size_bytes_from_tr(tr: html.HtmlElement) -> Optional[int]:
+        """Take a row from the attachment table and return the number of bytes
+        as an int.
+        """
+        file_size_str = get_input_value_from_tr(tr, 3, 4, " ")
+        if file_size_str is None:
+            return None
+        file_size = int(file_size_str)
+        if file_size == 0:
+            return None
+        return file_size
+
+    @staticmethod
+    def _get_pacer_doc_id(row: html.HtmlElement) -> str:
+        return row.xpath(".//a/@data-pacer-doc-id")
+
+    @staticmethod
+    def _get_pacer_seq_no_from_tr(row: html.HtmlElement) -> Optional[str]:
+        """Take a row of the attachment table, and return the sequence number
+        from the first space-separated field of the input's value attribute.
+        """
+        try:
+            input = row.xpath(".//input")[0]
+        except IndexError:
+            # No input in the row. Maybe it's sealed.
+            pass
+        else:
+            try:
+                name = input.xpath("./@value")[0]
+            except IndexError:
+                # No value attribute on this input.
+                pass
+            else:
+                return name.split(" ")[0]
+
+        return None
+
+    def _get_attachments(
+        self, cells: html.HtmlElement
+    ) -> List[Dict[str, Any]]:
+        rows = cells.xpath("./table//tr//tr")[1:]
+        result = []
+        for row in rows:
+            attachment = {
+                "attachment_number": self._get_attachment_number(row),
+                "description": self._get_description_from_tr(row),
+                "page_count": self._get_page_count_from_tr(row),
+                "file_size_str": get_file_size_str_from_tr(row),
+                "pacer_doc_id": self._get_pacer_doc_id(row),
+                # It may not be needed to reparse the seq_no
+                # for each row, but we may as well. So far, it
+                # has always been the same as the main document.
+                "pacer_seq_no": self._get_pacer_seq_no_from_tr(row),
+            }
+            file_size_bytes = self._get_file_size_bytes_from_tr(row)
+            if file_size_bytes is not None:
+                attachment["file_size_bytes"] = file_size_bytes
+            result.append(attachment)
+        return result
+
     @property
     def docket_entries(self):
         """Get the docket entries"""
@@ -577,19 +672,38 @@ def docket_entries(self):
         )
         docket_entry_rows = self.tree.xpath(path)
 
+        # Detect if the report was generated with "View multiple documents"
+        # option enabled.
+        view_multiple_documents = False
+        view_selected_btn = self.tree.xpath("//input[@value='View Selected']")
+        if view_selected_btn:
+            view_multiple_documents = True
         docket_entries = []
         for row in docket_entry_rows:
             de = {}
             cells = row.xpath("./td")
+            if len(cells) == 0:
+                continue
             if len(cells) == 1:
                 if cells[0].text_content() == "No docket entries found.":
                     break
                 continue
 
             date_filed_str = force_unicode(cells[0].text_content())
+            if not date_filed_str.strip():
+                if view_multiple_documents and len(cells) >= 3:
+                    last_de = docket_entries[-1]
+                    attachments = self._get_attachments(cells[2])
+                    if len(attachments) == 0:
+                        continue
+                    last_de["attachments"] = attachments
+                    continue
             de["date_filed"] = convert_date_string(date_filed_str)
             de["document_number"] = self._get_document_number(cells[1])
             de["pacer_doc_id"] = self._get_pacer_doc_id(cells[1])
+            pacer_seq_no = self._get_pacer_seq_no(cells[1])
+            if pacer_seq_no is not None:
+                de["pacer_seq_no"] = str(pacer_seq_no)
             if not de["document_number"]:
                 if de["pacer_doc_id"]:
                     # If we lack the document number, but have
@@ -629,6 +743,20 @@ def _get_pacer_doc_id(cell):
             doc1_url = urls[0].xpath("./@href")[0]
             return get_pacer_doc_id_from_doc1_url(doc1_url)
 
+    @staticmethod
+    def _get_pacer_seq_no(
+        cell: html.HtmlElement,
+    ) -> Optional[_ElementUnicodeResult]:
+        """Take a docket entry cell and return the value attribute of the
+        first input element in it, or None when the cell has no input.
+        """
+        try:
+            input = cell.xpath(".//input")[0]
+        except IndexError:
+            return None
+        else:
+            return input.xpath("./@value")[0]
+
     def _get_case_name(self):
         """Get the case name."""
 

diff --git a/juriscraper/pacer/attachment_page.py b/juriscraper/pacer/attachment_page.py
@@ -7,6 +7,8 @@
 from .reports import BaseReport
 from .utils import (
     get_court_id_from_doc_id_prefix,
+    get_file_size_str_from_tr,
+    get_input_value_from_tr,
     get_pacer_doc_id_from_doc1_url,
     reverse_goDLS_function,
 )
@@ -93,15 +95,13 @@ def data(self):
             file_size_bytes = self._get_file_size_bytes_from_tr(first_row)
             if file_size_bytes is not None:
                 result["file_size_bytes"] = file_size_bytes
-            result["file_size_str"] = self._get_file_size_str_from_tr(
-                first_row
-            )
+            result["file_size_str"] = get_file_size_str_from_tr(first_row)
         for row in rows:
             attachment = {
                 "attachment_number": self._get_attachment_number(row),
                 "description": self._get_description_from_tr(row),
                 "page_count": self._get_page_count_from_tr(row),
-                "file_size_str": self._get_file_size_str_from_tr(row),
+                "file_size_str": get_file_size_str_from_tr(row),
                 "pacer_doc_id": self._get_pacer_doc_id(row),
                 # It may not be needed to reparse the seq_no
                 # for each row, but we may as well. So far, it
@@ -272,30 +272,12 @@ def _get_description_from_tr(self, row):
         description = description_text_nodes[0].strip()
         return force_unicode(description)
 
-    @staticmethod
-    def _get_input_value_from_tr(tr, idx):
-        """Take a row from the attachment table and return the input value by
-        index.
-        """
-        try:
-            input = tr.xpath(".//input")[0]
-        except IndexError:
-            return None
-        else:
-            # initial value string "23515655-90555-2"
-            # "90555" is size in bytes "2" is pages
-            value = input.xpath("./@value")[0]
-            split_value = value.split("-")
-            if len(split_value) != 3:
-                return None
-            return split_value[idx]
-
     @staticmethod
     def _get_page_count_from_tr_input_value(tr):
         """Take a row from the attachment table and return the page count as an
         int extracted from the input value.
         """
-        count = AttachmentPage._get_input_value_from_tr(tr, 2)
+        count = get_input_value_from_tr(tr, 2, 3, "-")
         if count is not None:
             return int(count)
 
@@ -327,26 +309,14 @@ def _get_file_size_bytes_from_tr(tr):
         """Take a row from the attachment table and return the number of bytes
         as an int.
         """
-        file_size_str = AttachmentPage._get_input_value_from_tr(tr, 1)
+        file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
         if file_size_str is None:
             return None
         file_size = int(file_size_str)
         if file_size == 0:
             return None
         return file_size
 
-    @staticmethod
-    def _get_file_size_str_from_tr(tr):
-        """Take a row from the attachment table and return the number of bytes
-        as an int.
-        """
-        cells = tr.xpath("./td")
-        last_cell_contents = cells[-1].text_content()
-        units = ["kb", "mb"]
-        if any(unit in last_cell_contents.lower() for unit in units):
-            return last_cell_contents.strip()
-        return ""
-
     @staticmethod
     def _get_pacer_doc_id(row):
         """Take in a row from the attachment table and return the pacer_doc_id

diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py
@@ -21,6 +21,8 @@
 from .docket_utils import normalize_party_types
 from .reports import BaseReport
 from .utils import (
+    get_file_size_str_from_tr,
+    get_input_value_from_tr,
     get_pacer_doc_id_from_doc1_url,
     get_pacer_seq_no_from_doc1_anchor,
 )
@@ -1177,30 +1179,12 @@ def _get_attachment_id_value_from_tr(tr, idx):
                 return None
             return split_value[idx]
 
-    @staticmethod
-    def _get_input_value_from_tr(tr, idx):
-        """Take a row from the attachment table and return the input value by
-        index.
-        """
-        try:
-            input = tr.xpath(".//input")[0]
-        except IndexError:
-            return None
-        else:
-            # initial value string "23515655-90555-2"
-            # "90555" is size in bytes "2" is pages
-            value = input.xpath("./@value")[0]
-            split_value = value.split("-")
-            if len(split_value) != 3:
-                return None
-            return split_value[idx]
-
     @staticmethod
     def _get_page_count_from_tr_input_value(tr):
         """Take a row from the attachment table and return the page count as an
         int extracted from the input value.
         """
-        count = DocketReport._get_input_value_from_tr(tr, 2)
+        count = get_input_value_from_tr(tr, 2, 3, "-")
         if count is not None:
             return int(count)
 
@@ -1238,26 +1222,14 @@ def _get_file_size_bytes_from_tr(tr):
                 tr, 1
             )
         else:
-            file_size_str = DocketReport._get_input_value_from_tr(tr, 1)
+            file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
         if file_size_str is None:
             return None
         file_size = int(file_size_str)
         if file_size == 0:
             return None
         return file_size
 
-    @staticmethod
-    def _get_file_size_str_from_tr(tr):
-        """Take a row from the attachment table and return the number of bytes
-        as a str.
-        """
-        cells = tr.xpath("./td")
-        last_cell_contents = cells[-1].text_content()
-        units = ["kb", "mb"]
-        if any(unit in last_cell_contents.lower() for unit in units):
-            return last_cell_contents.strip()
-        return ""
-
     def _get_pacer_doc_id(self, row):
         """Take in a row from the attachment table and return the pacer_doc_id
         for the item in that row. Return None if the ID cannot be found.
@@ -1275,7 +1247,7 @@ def _get_pacer_doc_id(self, row):
             if value:
                 pacer_doc_suffix = value[0]
         else:
-            pacer_doc_suffix = DocketReport._get_input_value_from_tr(row, 0)
+            pacer_doc_suffix = get_input_value_from_tr(row, 0, 3, "-")
         if pacer_doc_suffix is None:
             return None
         # after inserting prefixes our final doc_id is "035023515655"
@@ -1315,7 +1287,7 @@ def _get_attachments(self, cells):
                 "attachment_number": self._get_attachment_number(row),
                 "description": self._get_description_from_tr(row),
                 "page_count": self._get_page_count_from_tr(row),
-                "file_size_str": self._get_file_size_str_from_tr(row),
+                "file_size_str": get_file_size_str_from_tr(row),
                 "pacer_doc_id": self._get_pacer_doc_id(row),
                 # It may not be needed to reparse the seq_no
                 # for each row, but we may as well. So far, it

diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py
@@ -853,3 +853,44 @@ def parse_sumDocSelected_from_row(
         if onclick and "sumDocSelected" in onclick[0]:
             return reverse_sumDocSelected_function(onclick[0])
     return None
+
+
+def get_input_value_from_tr(
+    tr: html.HtmlElement, idx: int, expected_values: int, split_value: str
+) -> Optional[str]:
+    """Take a row from the attachment table and return the input value by
+    index.
+
+    :param tr: An HTML row element from which the input value will be extracted.
+    :param idx: The index of the value to extract from the split list.
+    :param expected_values: The expected number of elements in the split value.
+    :param split_value: The delimiter used to split the value string.
+    :return: The extracted value at the specified index or None
+    """
+    try:
+        input_element = tr.xpath(".//input")[0]
+    except IndexError:
+        return None
+    else:
+        # value="6828943 14732034 1 62576"
+        # "62576" is size in bytes "1" is pages
+        # or
+        # value="23515655-90555-2"
+        # "90555" is size in bytes "2" is pages
+        value = input_element.xpath("./@value")[0]
+        split_value = value.split(split_value)
+        if len(split_value) != expected_values:
+            return None
+        return split_value[idx]
+
+
+def get_file_size_str_from_tr(tr: html.HtmlElement) -> str:
+    """Take a row from the attachment table and return the text of its last
+    cell as a str when it mentions kb or mb, otherwise an empty string.
+    """
+    cells = tr.xpath("./td")
+    last_cell_contents = cells[-1].text_content()
+    units = ["kb", "mb"]
+    if any(unit in last_cell_contents.lower() for unit in units):
+        return last_cell_contents.strip()
+    return ""