diff --git a/juriscraper/pacer/appellate_docket.py b/juriscraper/pacer/appellate_docket.py
index 58868eef6..22cdcb210 100644
--- a/juriscraper/pacer/appellate_docket.py
+++ b/juriscraper/pacer/appellate_docket.py
@@ -2,8 +2,9 @@
import re
import sys
from collections import OrderedDict
+from typing import Optional
-from lxml.html import tostring
+from lxml import html
from ..lib.judge_parsers import normalize_judge_string
from ..lib.log_tools import make_default_logger
@@ -18,6 +19,8 @@
from .reports import BaseReport
from .utils import (
get_court_id_from_url,
+ get_file_size_str_from_tr,
+ get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
is_pdf,
)
@@ -562,17 +565,20 @@ def parties(self):
self._parties = parties
return parties
- def _get_attachment_number(self, row):
+ def _get_attachment_number(self, row: html.HtmlElement) -> int:
"""Return the attachment number for an item.
- In district courts, this can be easily extracted. In bankruptcy courts,
- you must extract it, then subtract 1 from the value since these are
- tallied and include the main document.
+ :param row: Table row as an lxml element
+ :return: Attachment number for row
"""
return int(row.xpath(".//td/text()")[0].strip())
- def _get_description_from_tr(self, row):
- """Get the description from the row"""
+ def _get_description_from_tr(self, row: html.HtmlElement) -> str:
+ """Get the description from the row
+
+ :param row: Table row
+ :return: Attachment description
+ """
description_text_nodes = row.xpath(f"./td[4]//text()")
if not description_text_nodes:
# No text in the cell.
@@ -581,38 +587,20 @@ def _get_description_from_tr(self, row):
return force_unicode(description)
@staticmethod
- def _get_input_value_from_tr(tr, idx):
- """Take a row from the attachment table and return the input value by
- index.
- """
- try:
- input = tr.xpath(".//input")[0]
- except IndexError:
- return None
- else:
- # value="6828943 14732034 1 62576"
- # "62576" is size in bytes "1" is pages
- value = input.xpath("./@value")[0]
- split_value = value.split(" ")
- if len(split_value) != 4:
- return None
- return split_value[idx]
-
- @staticmethod
- def _get_page_count_from_tr(tr):
+ def _get_page_count_from_tr(tr: html.HtmlElement) -> Optional[int]:
"""Take a row from the attachment table and return the page count as an
int extracted from the input value.
"""
- count = AppellateDocketReport._get_input_value_from_tr(tr, 2)
+ count = get_input_value_from_tr(tr, 2, 4, " ")
if count is not None:
return int(count)
@staticmethod
- def _get_file_size_bytes_from_tr(tr):
+ def _get_file_size_bytes_from_tr(tr: html.HtmlElement) -> Optional[int]:
"""Take a row from the attachment table and return the number of bytes
as an int.
"""
- file_size_str = AppellateDocketReport._get_input_value_from_tr(tr, 3)
+ file_size_str = get_input_value_from_tr(tr, 3, 4, " ")
if file_size_str is None:
return None
file_size = int(file_size_str)
@@ -621,23 +609,11 @@ def _get_file_size_bytes_from_tr(tr):
return file_size
@staticmethod
- def _get_file_size_str_from_tr(tr):
- """Take a row from the attachment table and return the number of bytes
- as a str.
- """
- cells = tr.xpath("./td")
- last_cell_contents = cells[-1].text_content()
- units = ["kb", "mb"]
- if any(unit in last_cell_contents.lower() for unit in units):
- return last_cell_contents.strip()
- return ""
-
- @staticmethod
- def _get_pacer_doc_id(row):
+ def _get_pacer_doc_id(row: html.HtmlElement) -> list:
+     """Return the ``data-pacer-doc-id`` attribute values of the anchors in
+     the row.
+
+     :param row: Table row as an lxml element
+     :return: The result of the XPath attribute query, returned as-is (a
+     list of attribute values, not a single string).
+     """
+     # NOTE(review): callers store this result under "pacer_doc_id";
+     # confirm whether a single ID (e.g. the first match) was intended.
      return row.xpath(".//a/@data-pacer-doc-id")
@staticmethod
- def _get_pacer_seq_no_from_tr(row):
+ def _get_pacer_seq_no_from_tr(row: html.HtmlElement) -> Optional[str]:
"""Take a row of the attachment table, and return the sequence number
from the name attribute.
"""
@@ -666,7 +642,7 @@ def _get_attachments(self, cells):
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
- "file_size_str": self._get_file_size_str_from_tr(row),
+ "file_size_str": get_file_size_str_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
# It may not be needed to reparse the seq_no
# for each row, but we may as well. So far, it
diff --git a/juriscraper/pacer/attachment_page.py b/juriscraper/pacer/attachment_page.py
index 6d55bdb7f..b48cf31c1 100644
--- a/juriscraper/pacer/attachment_page.py
+++ b/juriscraper/pacer/attachment_page.py
@@ -7,6 +7,8 @@
from .reports import BaseReport
from .utils import (
get_court_id_from_doc_id_prefix,
+ get_file_size_str_from_tr,
+ get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
reverse_goDLS_function,
)
@@ -93,15 +95,13 @@ def data(self):
file_size_bytes = self._get_file_size_bytes_from_tr(first_row)
if file_size_bytes is not None:
result["file_size_bytes"] = file_size_bytes
- result["file_size_str"] = self._get_file_size_str_from_tr(
- first_row
- )
+ result["file_size_str"] = get_file_size_str_from_tr(first_row)
for row in rows:
attachment = {
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
- "file_size_str": self._get_file_size_str_from_tr(row),
+ "file_size_str": get_file_size_str_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
# It may not be needed to reparse the seq_no
# for each row, but we may as well. So far, it
@@ -272,30 +272,12 @@ def _get_description_from_tr(self, row):
description = description_text_nodes[0].strip()
return force_unicode(description)
- @staticmethod
- def _get_input_value_from_tr(tr, idx):
- """Take a row from the attachment table and return the input value by
- index.
- """
- try:
- input = tr.xpath(".//input")[0]
- except IndexError:
- return None
- else:
- # initial value string "23515655-90555-2"
- # "90555" is size in bytes "2" is pages
- value = input.xpath("./@value")[0]
- split_value = value.split("-")
- if len(split_value) != 3:
- return None
- return split_value[idx]
-
@staticmethod
def _get_page_count_from_tr_input_value(tr):
"""Take a row from the attachment table and return the page count as an
int extracted from the input value.
"""
- count = AttachmentPage._get_input_value_from_tr(tr, 2)
+ count = get_input_value_from_tr(tr, 2, 3, "-")
if count is not None:
return int(count)
@@ -327,7 +309,7 @@ def _get_file_size_bytes_from_tr(tr):
"""Take a row from the attachment table and return the number of bytes
as an int.
"""
- file_size_str = AttachmentPage._get_input_value_from_tr(tr, 1)
+ file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
if file_size_str is None:
return None
file_size = int(file_size_str)
@@ -335,18 +317,6 @@ def _get_file_size_bytes_from_tr(tr):
return None
return file_size
- @staticmethod
- def _get_file_size_str_from_tr(tr):
- """Take a row from the attachment table and return the number of bytes
- as an int.
- """
- cells = tr.xpath("./td")
- last_cell_contents = cells[-1].text_content()
- units = ["kb", "mb"]
- if any(unit in last_cell_contents.lower() for unit in units):
- return last_cell_contents.strip()
- return ""
-
@staticmethod
def _get_pacer_doc_id(row):
"""Take in a row from the attachment table and return the pacer_doc_id
diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py
index 20201dad2..97b74930e 100644
--- a/juriscraper/pacer/docket_report.py
+++ b/juriscraper/pacer/docket_report.py
@@ -21,6 +21,8 @@
from .docket_utils import normalize_party_types
from .reports import BaseReport
from .utils import (
+ get_file_size_str_from_tr,
+ get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
get_pacer_seq_no_from_doc1_anchor,
)
@@ -1177,30 +1179,12 @@ def _get_attachment_id_value_from_tr(tr, idx):
return None
return split_value[idx]
- @staticmethod
- def _get_input_value_from_tr(tr, idx):
- """Take a row from the attachment table and return the input value by
- index.
- """
- try:
- input = tr.xpath(".//input")[0]
- except IndexError:
- return None
- else:
- # initial value string "23515655-90555-2"
- # "90555" is size in bytes "2" is pages
- value = input.xpath("./@value")[0]
- split_value = value.split("-")
- if len(split_value) != 3:
- return None
- return split_value[idx]
-
@staticmethod
def _get_page_count_from_tr_input_value(tr):
"""Take a row from the attachment table and return the page count as an
int extracted from the input value.
"""
- count = DocketReport._get_input_value_from_tr(tr, 2)
+ count = get_input_value_from_tr(tr, 2, 3, "-")
if count is not None:
return int(count)
@@ -1238,7 +1222,7 @@ def _get_file_size_bytes_from_tr(tr):
tr, 1
)
else:
- file_size_str = DocketReport._get_input_value_from_tr(tr, 1)
+ file_size_str = get_input_value_from_tr(tr, 1, 3, "-")
if file_size_str is None:
return None
file_size = int(file_size_str)
@@ -1246,18 +1230,6 @@ def _get_file_size_bytes_from_tr(tr):
return None
return file_size
- @staticmethod
- def _get_file_size_str_from_tr(tr):
- """Take a row from the attachment table and return the number of bytes
- as a str.
- """
- cells = tr.xpath("./td")
- last_cell_contents = cells[-1].text_content()
- units = ["kb", "mb"]
- if any(unit in last_cell_contents.lower() for unit in units):
- return last_cell_contents.strip()
- return ""
-
def _get_pacer_doc_id(self, row):
"""Take in a row from the attachment table and return the pacer_doc_id
for the item in that row. Return None if the ID cannot be found.
@@ -1275,7 +1247,7 @@ def _get_pacer_doc_id(self, row):
if value:
pacer_doc_suffix = value[0]
else:
- pacer_doc_suffix = DocketReport._get_input_value_from_tr(row, 0)
+ pacer_doc_suffix = get_input_value_from_tr(row, 0, 3, "-")
if pacer_doc_suffix is None:
return None
# after inserting prefixes our final doc_id is "035023515655"
@@ -1315,7 +1287,7 @@ def _get_attachments(self, cells):
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
- "file_size_str": self._get_file_size_str_from_tr(row),
+ "file_size_str": get_file_size_str_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
# It may not be needed to reparse the seq_no
# for each row, but we may as well. So far, it
diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py
index a4f2c1d03..25986967a 100644
--- a/juriscraper/pacer/utils.py
+++ b/juriscraper/pacer/utils.py
@@ -853,3 +853,44 @@ def parse_sumDocSelected_from_row(
if onclick and "sumDocSelected" in onclick[0]:
return reverse_sumDocSelected_function(onclick[0])
return None
+
+
+def get_input_value_from_tr(
+    tr: html.HtmlElement, idx: int, expected_values: int, split_value: str
+) -> Optional[str]:
+    """Take a row from the attachment table and return the input value by
+    index.
+
+    The value attribute of the first input in the row is split on the given
+    delimiter. Two formats are handled by the callers:
+
+    - value="6828943 14732034 1 62576"
+      "62576" is size in bytes "1" is pages
+    - value="23515655-90555-2"
+      "90555" is size in bytes "2" is pages
+
+    :param tr: An HTML row element from which the input value will be extracted.
+    :param idx: The index of the value to extract from the split list.
+    :param expected_values: The expected number of elements in the split value.
+    :param split_value: The delimiter used to split the value string.
+    :return: The extracted value at the specified index, or None if the row
+    has no input, the input has no value attribute, or the value does not
+    split into exactly `expected_values` parts.
+    """
+    inputs = tr.xpath(".//input")
+    if not inputs:
+        return None
+    # Only the first input in the row is considered.
+    values = inputs[0].xpath("./@value")
+    if not values:
+        # An input without a value attribute has nothing to split.
+        return None
+    # Keep the parts in their own name instead of rebinding the
+    # `split_value` delimiter parameter to a list.
+    parts = values[0].split(split_value)
+    if len(parts) != expected_values:
+        return None
+    return parts[idx]
+
+
+def get_file_size_str_from_tr(tr: html.HtmlElement) -> str:
+    """Take a row from the attachment table and return the file size as a
+    str.
+
+    The size is taken from the text of the last cell in the row, and is only
+    accepted if that text mentions a known unit ("kb" or "mb", in any case).
+
+    :param tr: Table row as an lxml element
+    :return: The stripped text of the last cell if it contains a size unit,
+    otherwise an empty string (also when the row has no cells).
+    """
+    cells = tr.xpath("./td")
+    if not cells:
+        # A row without <td> cells cannot carry a size.
+        return ""
+    last_cell_contents = cells[-1].text_content()
+    units = ("kb", "mb")
+    # Lowercase once rather than once per unit.
+    lowered = last_cell_contents.lower()
+    if any(unit in lowered for unit in units):
+        return last_cell_contents.strip()
+    return ""
diff --git a/tests/examples/pacer/dockets/appellate/ca1_46307.html b/tests/examples/pacer/dockets/appellate/ca1_46307.html
new file mode 100644
index 000000000..7370c9a66
--- /dev/null
+++ b/tests/examples/pacer/dockets/appellate/ca1_46307.html
@@ -0,0 +1,667 @@
+
+
+
+Weston Robert Sager [NTC Government - Other] (see above)
+
+
+
WINDHAM POLICE DEPARTMENT
+
+Defendant - Appellee
+
+
+
+Eric Alexander Maher Direct: 603-778-0686 [NTC Retained] (see above)
+
+Weston Robert Sager [NTC Government - Other] (see above)
+
+
+
WINDHAM, NH
+
+Defendant - Appellee
+
+
+
+Eric Alexander Maher Direct: 603-778-0686 [NTC Retained] (see above)
+
+Weston Robert Sager [NTC Government - Other] (see above)
+
+
+
+
+
+
+
+
+ PAUL MARAVELIAS
+
+Plaintiff - Appellant
+
+v.
+
+JOHN J. COUGHLIN, Senior Judge, 10th Circuit Court- District Division, in his individual and official capacity; GORDON J. MACDONALD, New Hampshire Attorney General; PATRICIA G. CONWAY, Rockingham County Attorney, in her official capacity; TOWN OF WINDHAM, NH; GERALD S. LEWIS, Chief of Police, Town of Windham, in his official capacity, WINDHAM POLICE DEPARTMENT
+
+Defendants - Appellees
+
+