diff --git a/capstone/capapi/resources.py b/capstone/capapi/resources.py
index 75685a965..570fe650b 100644
--- a/capstone/capapi/resources.py
+++ b/capstone/capapi/resources.py
@@ -1,6 +1,6 @@
import hashlib
import concurrent.futures
-from copy import copy
+from copy import copy
from functools import reduce
import rest_framework.request
@@ -13,6 +13,7 @@
from django.db.models import QuerySet
from django.http import QueryDict
from django.test.utils import CaptureQueriesContext
+from django.test.client import RequestFactory
from django.utils.functional import SimpleLazyObject
from django_hosts import reverse as django_hosts_reverse
from elasticsearch import Elasticsearch
@@ -269,3 +270,11 @@ def api_request(request, viewset, method, url_kwargs={}, get_params={}):
api_request.GET.update(get_params)
return viewset.as_view({'get': method})(api_request, **url_kwargs)
+
+
+def call_serializer(Serializer, item, query_params=None):
+    """Return `Serializer(item).data`, computed with a fake DRF GET request (carrying `query_params`) as context."""
+    django_request = RequestFactory().get('/', query_params)
+    serializer_context = {'request': rest_framework.request.Request(django_request)}
+    serializer = Serializer(item, context=serializer_context)
+    return serializer.data
diff --git a/capstone/capapi/serializers.py b/capstone/capapi/serializers.py
index 3115802d7..0870154ce 100644
--- a/capstone/capapi/serializers.py
+++ b/capstone/capapi/serializers.py
@@ -546,49 +546,9 @@ def data(self):
return super(DocumentSerializer, self).data
-class ConvertCaseDocumentSerializer(CaseDocumentSerializer):
- first_page_order = serializers.CharField()
- last_page_order = serializers.CharField()
-
- def to_representation(self, instance):
- first_page_order = self.context.get("first_page_order")
- last_page_order = self.context.get("last_page_order")
-
- data = super().to_representation(instance)
-
- data.pop("reporter", None)
- data.pop("volume", None)
- data.pop("url", None)
- data.pop("frontend_url", None)
- data.pop("frontend_pdf_url", None)
- try:
- data["court"].pop("slug", None)
- data["court"].pop("url", None)
- except KeyError as err:
- print(f"Cannot pop field {err} because 'court' doesn't exist")
- try:
- data["jurisdiction"].pop("slug", None)
- data["jurisdiction"].pop("whitelisted", None)
- data["jurisdiction"].pop("url", None)
- except KeyError as err:
- print(f"Cannot pop field {err} because 'jurisdiction' doesn't exist")
-
- if "preview" in data:
- data.pop("preview")
- data["first_page_order"] = first_page_order
- data["last_page_order"] = last_page_order
- return data
-
-
class ConvertNoLoginCaseDocumentSerializer(CaseDocumentSerializerWithCasebody):
- first_page_order = serializers.CharField()
- last_page_order = serializers.CharField()
-
def to_representation(self, instance, check_permissions=False):
"""Tell get_casebody not to check for case download permissions."""
- first_page_order = self.context.get("first_page_order")
- last_page_order = self.context.get("last_page_order")
-
data = super().to_representation(instance, check_permissions=check_permissions)
try:
data["casebody"] = data["casebody"]["data"]
@@ -615,8 +575,6 @@ def to_representation(self, instance, check_permissions=False):
data.pop("preview", None)
- data["first_page_order"] = first_page_order
- data["last_page_order"] = last_page_order
return data
@property
diff --git a/capstone/capdb/tests/test_models.py b/capstone/capdb/tests/test_models.py
index d901c8795..998782fad 100644
--- a/capstone/capdb/tests/test_models.py
+++ b/capstone/capdb/tests/test_models.py
@@ -93,96 +93,8 @@ def test_volume_save_slug_update(volume_metadata):
@pytest.mark.django_db(databases=["capdb"])
-def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path):
- # set up a redacted case
- case = case_factory(volume__redacted=True, volume__pdf_file="redacted_volume.pdf")
- structure = case.structure
- page = structure.pages.first()
- structure.opinions = [
- # redacted paragraph
- {
- "type": "head",
- "paragraphs": [
- {
- "class": "parties",
- "block_ids": ["BL_1.1"],
- "id": "b1-1",
- "redacted": True,
- }
- ],
- },
- {
- "type": "majority",
- "paragraphs": [
- # redacted content blocks
- {
- "class": "p",
- "block_ids": ["BL_1.2", "BL_1.3"],
- "id": "b1-2",
- },
- # redacted image block
- {
- "class": "image",
- "block_ids": ["BL_1.4"],
- "id": "b1-3",
- },
- ],
- # redacted footnote
- "footnotes": [
- {
- # redacted footnote paragraph
- "paragraphs": [
- {
- "class": "p",
- "block_ids": ["BL_1.5"],
- "id": "b1-4",
- }
- ],
- "label": "1",
- "id": "footnote_1_1",
- "redacted": True,
- }
- ],
- },
- ]
- structure.save()
- page.blocks = [
- {
- "id": "BL_1.1",
- "class": "p",
- "tokens": ["Text 1"],
- "rect": [25, 11, 300, 490],
- },
- {
- "id": "BL_1.2",
- "class": "p",
- "tokens": ["Text 2"],
- "redacted": True,
- "rect": [4, 32, 100, 100],
- },
- {
- "id": "BL_1.3",
- "class": "p",
- "tokens": [["redact"], "Text 3", ["/redact"]],
- "rect": [225, 11, 430, 290],
- },
- {
- "id": "BL_1.4",
- "format": "image",
- "redacted": True,
- "class": "image",
- "data": "image data",
- "rect": [0, 0, 100, 100],
- },
- {
- "id": "BL_1.5",
- "class": "p",
- "tokens": ["Text 4"],
- "rect": [190, 312, 330, 490],
- },
- ]
- page.encrypt()
- page.save()
+def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path, redacted_case_factory):
+ case = redacted_case_factory()
# set up volume pdfs
volume = case.volume
@@ -194,15 +106,16 @@ def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path):
download_files_storage.save(volume.pdf_file.name, StringIO("redacted"))
# verify redacted case contents
- case.sync_case_body_cache()
- case.refresh_from_db()
- assert case.body_cache.text == "\n\n"
+ assert case.body_cache.text == "\nnot redacted\n"
assert xml_equal(
case.body_cache.html,
- '",
+ '\n'
+ ' \n'
+ ' \n'
+ ' not redacted
\n'
+ ' \n'
+ '\n',
)
# unredact
@@ -210,7 +123,7 @@ def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path):
volume.refresh_from_db()
case.body_cache.refresh_from_db()
assert volume.redacted is False
- assert case.body_cache.text == "Text 1\nText 2Text 3\nText 4\n"
+ assert case.body_cache.text == "Text 1\nText 2Text 3not redacted\nText 4\n"
assert html_equal(
case.body_cache.html,
dedent(
@@ -220,7 +133,7 @@ def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path):
Text 1
- Text 2Text 3
+ Text 2Text 3not redacted
diff --git a/capstone/config/celery.py b/capstone/config/celery.py
index f4df3a723..b27378866 100644
--- a/capstone/config/celery.py
+++ b/capstone/config/celery.py
@@ -17,7 +17,7 @@
'scripts.update_snippets',
'scripts.refactor_xml',
'scripts.make_pdf',
- 'scripts.convert_s3',
+ 'scripts.export_cap_static',
])
# Using a string here means the worker doesn't have to serialize
diff --git a/capstone/config/settings/settings_base.py b/capstone/config/settings/settings_base.py
index 619f5d2c8..1f3b40552 100644
--- a/capstone/config/settings/settings_base.py
+++ b/capstone/config/settings/settings_base.py
@@ -385,6 +385,10 @@ def immutable_file_test(path, url):
},
}
CELERY_TIMEZONE = 'UTC'
+CELERY_TASK_ROUTES = {
+ "scripts.export_cap_static.export_cases_by_volume": {"queue": "cap_static"},
+}
+
### CAP API settings ###
diff --git a/capstone/fabfile.py b/capstone/fabfile.py
index 2bc394af5..9d5444db1 100644
--- a/capstone/fabfile.py
+++ b/capstone/fabfile.py
@@ -7,6 +7,7 @@
import subprocess
import sys
import tempfile
+import traceback
from collections import defaultdict
from contextlib import contextmanager
from datetime import datetime
@@ -61,8 +62,8 @@
validate_private_volumes as validate_private_volumes_script,
export,
update_snippets,
+ export_cap_static,
)
-from scripts import convert_s3
from scripts.helpers import (
copy_file,
volume_barcode_from_folder,
@@ -298,6 +299,7 @@ def import_web_volumes():
try:
import_volume(f.name)
except IntegrityError:
+ traceback.print_exc()
print(" - integrity error; volume already imported? skipping")
@@ -452,31 +454,30 @@ def retry_export_cases(version_string):
@task
-def export_cases_to_s3(reporter="528"):
+def export_cap_static_cases(dest_dir="/tmp/cap_exports", reporter=None, volume=None, last_run_before=None):
"""
- Export a version to S3 of all cases' texts and metadata
- by reporter and volume.
+    First step of the static files export process: schedule one export_cases_by_volume celery task per in-scope volume, optionally narrowed by reporter and/or volume pk.
"""
- redacted = True
- bucket = convert_s3.get_bucket_name(redacted)
- convert_s3.export_cases_to_s3(bucket, redacted, reporter)
-
-
-@task
-def export_reporters_to_s3():
- """
- Run export of all reporters and their contents to S3.
- """
- convert_s3.put_reporters_on_s3(redacted=True)
+    print("Scheduling tasks to export volumes")
+    volumes = VolumeMetadata.objects.exclude(out_of_scope=True)
+    if volume:
+        volumes = volumes.filter(pk=volume)
+    if reporter:
+        volumes = volumes.filter(reporter_id=reporter)
+    tasks.run_task_for_volumes(
+        export_cap_static.export_cases_by_volume,
+        volumes,
+        last_run_before=last_run_before,  # passed through unchanged to the task runner
+        dest_dir=dest_dir,
+    )
@task
-def export_reporters_to_s3_trial():
+def summarize_cap_static(dest_dir="/tmp/cap_exports"):
"""
- Run export of all reporters and their contents to S3
- for first API page.
+ Second step of the static files export process: add summary files at the reporter level and top level.
"""
- convert_s3.put_reporters_on_s3_trial(redacted=True)
+ export_cap_static.finalize_reporters(dest_dir)
@task
diff --git a/capstone/scripts/convert_s3.py b/capstone/scripts/convert_s3.py
deleted file mode 100644
index 1179595af..000000000
--- a/capstone/scripts/convert_s3.py
+++ /dev/null
@@ -1,522 +0,0 @@
-import base64
-import boto3
-import hashlib
-import json
-import requests
-from botocore.exceptions import ClientError
-from collections import namedtuple
-from celery import group, shared_task
-
-from capapi.documents import CaseDocument
-from capapi.serializers import (
- ConvertNoLoginCaseDocumentSerializer,
-)
-from capdb.models import Reporter, VolumeMetadata
-
-s3_client = boto3.client("s3")
-api_endpoint = "https://api.case.law/v1/"
-
-
-def put_reporters_on_s3_trial(redacted: bool) -> None:
- """
- Kicks off the full cascading S3 file creation series
- for a subsection of reporters.
- """
- # set bucket name for all operations
- bucket = get_bucket_name(redacted)
-
- current_endpoint = f"{api_endpoint}reporters/"
- print("Converting files from ", current_endpoint)
- response = requests.get(current_endpoint)
- results = response.json()
- reporters_metadata = ""
- all_volumes_metadata = ""
-
- # write each entry into jsonl
- for result in results["results"]:
- # for each reporter, kick off cascading export to S3
- reporter_metadata, subset_volumes_metadata = export_cases_to_s3(
- bucket, redacted, result["id"]
- )
- reporters_metadata += reporter_metadata
- all_volumes_metadata += subset_volumes_metadata
-
- # uploads all reporters metadata to top level
- hash_and_upload(
- reporters_metadata,
- bucket,
- "ReportersMetadata.jsonl",
- "application/jsonl",
- )
-
- # uploads all volumes metadata to top level
- hash_and_upload(
- all_volumes_metadata,
- bucket,
- "VolumesMetadata.jsonl",
- "application/jsonl",
- )
-
-
-def put_reporters_on_s3(redacted: bool) -> None:
- """
- Kicks off the full cascading file creation series.
- """
- # set bucket name for all operations
- bucket = get_bucket_name(redacted)
-
- current_endpoint = f"{api_endpoint}reporters/"
- previous_cursor = None
- current_cursor = ""
- reporters_metadata = ""
- all_volumes_metadata = ""
-
- while current_endpoint:
- print("Converting files from ", current_endpoint)
- response = requests.get(current_endpoint)
- results = response.json()
-
- # write each entry into jsonl
- for result in results["results"]:
- # for each reporter, kick off cascading export to S3
- reporter_metadata, subset_volumes_metadata = export_cases_to_s3(
- bucket, redacted, result["id"]
- )
- reporters_metadata += reporter_metadata
- all_volumes_metadata += subset_volumes_metadata
-
- # update cursor to access next endpoint
- current_cursor = results["next"]
- if current_cursor != previous_cursor:
- print("Update next to: ", current_cursor)
-
- previous_cursor = current_cursor
- current_endpoint = current_cursor
-
- # uploads all reporters metadata to top level
- hash_and_upload(
- reporters_metadata,
- bucket,
- "ReportersMetadata.jsonl",
- "application/jsonl",
- )
-
- # uploads all volumes metadata to top level
- hash_and_upload(
- all_volumes_metadata,
- bucket,
- "VolumesMetadata.jsonl",
- "application/jsonl",
- )
-
-
-def export_cases_to_s3(bucket: str, redacted: bool, reporter_id: str) -> tuple:
- """
- Write .jsonl file with all cases per reporter.
- """
- reporter = Reporter.objects.get(pk=reporter_id)
-
- # Make sure there are volumes in the reporter
- if not reporter.volumes.exclude(out_of_scope=True):
- print("WARNING: Reporter '{}' contains NO VOLUMES.".format(reporter.full_name))
- # Returning empty string to have something to append to reporter metadata
- return ("", "")
-
- # Make sure there are cases in the reporter
- cases_search = CaseDocument.raw_search().filter("term", reporter__id=reporter.id)
- if cases_search.count() == 0:
- print("WARNING: Reporter '{}' contains NO CASES.".format(reporter.full_name))
- # Returning empty string to have something to append to reporter metadata
- return ("", "")
-
- # TODO: address reporters that share slug
- if reporter_id in reporter_slug_dict:
- reporter_prefix = reporter_slug_dict[reporter_id]
- else:
- reporter_prefix = reporter.short_name_slug
-
- # upload reporter metadata
- reporter_metadata = put_reporter_metadata(bucket, reporter, reporter_prefix)
-
- # get in-scope volumes with volume numbers in each reporter
- subset_volumes_metadata = ""
-
- job = group(
- export_cases_by_volume.s(
- volume=volume.barcode,
- reporter_prefix=reporter_prefix,
- dest_bucket=bucket,
- redacted=redacted,
- )
- for volume in (
- reporter.volumes.exclude(volume_number=None)
- .exclude(volume_number="")
- .exclude(out_of_scope=True)
- )
- )
-
- results = job.apply_async()
-
- for i in range(3):
- try:
- for volume_metadata in results.get():
- subset_volumes_metadata += volume_metadata
- break
- except ClientError as err:
- if err.response['Error']['Code'] == 'NoSuchKey':
- print(f'NoSuchKey in {reporter_id} on try {i + 1}')
- else:
- raise
-
- return (reporter_metadata, subset_volumes_metadata)
-
-
-@shared_task
-def export_cases_by_volume(
- volume: str, reporter_prefix: str, dest_bucket: str, redacted: bool
-) -> str:
- """
- Write a .json file for each case per volume.
- Write a .jsonl file with all cases' metadata per volume.
- Write a .jsonl file with all volume metadata for this collection.
- """
-
- volume = VolumeMetadata.objects.get(pk=volume)
-
- case_file_name_index = 1
- prev_case_first_page = None
-
- vars = {
- "serializer": ConvertNoLoginCaseDocumentSerializer,
- "query_params": {"body_format": "text"},
- }
-
- cases = list(volume.case_metadatas.select_related().order_by("case_id"))
-
- if len(cases) == 0:
- print("WARNING: Volume '{}' contains NO CASES.".format(volume.barcode))
- # Returning empty string to have something to append to volume metadata
- return ""
-
- # open each volume and put case text or metadata into file based on format
- cases_search = CaseDocument.raw_search().filter(
- "term", volume__barcode=volume.barcode
- )
-
- # create a dictionary to grab data from each CaseDocument search object
- cases_search_by_id = {
- case_search["_source"]["id"]: case_search for case_search in cases_search.scan()
- }
-
- volume_prefix = f"{reporter_prefix}/{volume.volume_number}"
- volume_metadata = put_volume_metadata(dest_bucket, volume, volume_prefix)
-
- cases_key = f"{volume_prefix}/Cases/"
-
- # fetch existing files to compare to what we have
- s3_contents_hashes = fetch_s3_files(dest_bucket, cases_key)
-
- # fake Request object used for serializing case with DRF's serializer
- vars["fake_request"] = namedtuple("Request", ["query_params", "accepted_renderer"])(
- query_params=vars["query_params"],
- accepted_renderer=None,
- )
- # fake Request object used for serializing cases with DRF's serializer
- vars["fake_request"] = namedtuple("Request", ["query_params", "accepted_renderer"])(
- query_params={"body_format": "text"},
- accepted_renderer=None,
- )
-
- # create a metadata contents string to append case metadata content
- metadata_contents = ""
-
- # store the serialized case data
- for case in cases:
- # identify associated search item to add additional data
- try:
- item = cases_search_by_id[case.id]
- except KeyError:
- continue
-
- serializer = vars["serializer"](
- item["_source"],
- context={
- "request": vars["fake_request"],
- "first_page_order": case.first_page_order,
- "last_page_order": case.last_page_order,
- },
- )
-
- # add data to metadata_contents string without 'casebody'
- metadata_data = serializer.data
- metadata_data.pop("casebody", None)
- metadata_contents += json.dumps(metadata_data) + "\n"
-
- # compose each casefile with a hash
- case_contents = json.dumps(serializer.data) + "\n"
- hash_object = hashlib.sha256(case_contents.encode("utf-8"))
- case_contents_hash = base64.b64encode(hash_object.digest()).decode()
-
- # calculate casefile name
- if prev_case_first_page == case.first_page:
- case_file_name_index += 1
- else:
- case_file_name_index = 1
- case_file_name = (
- f"{case.first_page.zfill(4)}-{str(case_file_name_index).zfill(2)}.json"
- )
-
- # set so we can use to determine multiple cases on single page
- prev_case_first_page = case.first_page
-
- # identify key: hash pair for current case
- dest_key = f"{cases_key}{case_file_name}"
- s3_key_hash = s3_contents_hashes.pop(dest_key, None)
-
- if s3_key_hash is None or s3_key_hash != case_contents_hash:
- hash_and_upload(
- case_contents,
- dest_bucket,
- dest_key,
- "application/jsonl",
- )
-
- # remove files from S3 that would otherwise create repeats
- for s3_case_key in s3_contents_hashes:
- try:
- s3_client.delete_object(
- Bucket=dest_bucket,
- Key=s3_case_key,
- )
- except ClientError as err:
- if err.response['Error']['Code'] == 'NoSuchKey':
- print(f"Couldn't delete {s3_case_key}, no such key")
- else:
- raise Exception(
- f"Couldn't delete {dest_bucket}/{s3_case_key}: %s" % err
- )
-
- hash_and_upload(
- metadata_contents,
- dest_bucket,
- f"{volume_prefix}/CasesMetadata.jsonl",
- "application/jsonl",
- )
-
- # copies each volume PDF to new location if it doesn't already exist
- copy_volume_pdf(volume, volume_prefix, dest_bucket, redacted)
- # return metadata for single volume
- return volume_metadata
-
-
-# Reporter-specific helper functions
-
-# Some reporters share a slug, so we have to differentiate with ids
-reporter_slug_dict = {
- "415": "us-ct-cl",
- "657": "wv-ct-cl",
- "580": "mass-app-div-annual",
- "576": "mass-app-div",
-}
-
-
-def put_reporter_metadata(bucket: str, reporter: object, key: str) -> str:
- """
- Write a .json file with just the reporter metadata.
- Return the line of reporter metadata to be used in all reporters metadata file.
- """
- response = requests.get(f"{api_endpoint}reporters/{reporter.id}/")
- results = response.json()
-
- # add additional fields from reporter obj
- results["harvard_hollis_id"] = reporter.hollis
-
- # remove unnecessary fields
- results.pop("url", None)
- results.pop("frontend_url", None)
- try:
- for jurisdiction in results["jurisdictions"]:
- jurisdiction.pop("slug", None)
- jurisdiction.pop("whitelisted", None)
- jurisdiction.pop("url", None)
- except KeyError as err:
- print(f"Cannot pop field {err} because 'jurisdictions' doesn't exist")
-
- reporter_metadata = json.dumps(results) + "\n"
- # add each line to reporters_metadata string
- hash_and_upload(
- reporter_metadata, bucket, f"{key}/ReporterMetadata.json", "application/json"
- )
- return reporter_metadata
-
-
-# Volume-specific helper functions
-
-
-def put_volume_metadata(bucket: str, volume: object, key: str) -> str:
- """
- Write a .json file with just the single volume metadata.
- """
- response = requests.get(f"{api_endpoint}volumes/{volume.barcode}/")
- results = response.json()
- # change "barcode" key to "id" key
- results["id"] = results.pop("barcode", None)
-
- # add additional fields from model
- results["harvard_hollis_id"] = volume.hollis_number
- results["spine_start_year"] = volume.spine_start_year
- results["spine_end_year"] = volume.spine_end_year
- results["publication_city"] = volume.publication_city
- results["second_part_of_id"] = volume.second_part_of_id
-
- # add information about volume's nominative_reporter
- if volume.nominative_reporter_id:
- results["nominative_reporter"] = {}
- results["nominative_reporter"]["id"] = volume.nominative_reporter_id
- results["nominative_reporter"][
- "short_name"
- ] = volume.nominative_reporter.short_name
- results["nominative_reporter"][
- "full_name"
- ] = volume.nominative_reporter.full_name
- results["nominative_reporter"][
- "volume_number"
- ] = volume.nominative_volume_number
- results.pop("nominative_volume_number", None)
- results.pop("nominative_name", None)
- elif volume.nominative_reporter_id is None and (
- volume.nominative_volume_number or volume.nominative_name
- ):
- results["nominative_reporter"] = {}
- results["nominative_reporter"][
- "volume_number"
- ] = volume.nominative_volume_number
- results["nominative_reporter"]["nominative_name"] = volume.nominative_name
- else:
- results["nominative_reporter"] = None
-
- # remove unnecessary fields
- results.pop("reporter", None)
- results.pop("reporter_url", None)
- results.pop("url", None)
- results.pop("pdf_url", None)
- results.pop("frontend_url", None)
- try:
- for jurisdiction in results["jurisdictions"]:
- jurisdiction.pop("slug", None)
- jurisdiction.pop("whitelisted", None)
- jurisdiction.pop("url", None)
- except KeyError as err:
- print(f"Cannot pop field {err} because 'jurisdictions' doesn't exist")
-
- volume_metadata = json.dumps(results) + "\n"
- hash_and_upload(
- volume_metadata, bucket, f"{key}/VolumeMetadata.json", "application/json"
- )
- return volume_metadata
-
-
-def copy_volume_pdf(
- volume: object, volume_prefix: str, dest_bucket: str, redacted: bool
-) -> None:
- """
- Copy PDF volume from original location to destination bucket
- """
- if redacted:
- source_prefix = "pdf/redacted"
- else:
- source_prefix = "pdf/unredacted"
-
- try:
- s3_client.head_object(Bucket=dest_bucket, Key=f"{volume_prefix}/Volume.pdf")
- print(f"{dest_bucket}/{volume_prefix}/Volume.pdf already uploaded!")
- except ClientError as err:
- if err.response["Error"]["Code"] == "404":
- # "With a copy command, the checksum of the object is a direct checksum of the full object."
- # https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html
- copy_source = {
- "Bucket": "harvard-cap-archive",
- "Key": f"{source_prefix}/{volume.barcode}.pdf",
- }
- copy_object_params = {
- "Bucket": dest_bucket,
- "Key": f"{volume_prefix}/Volume.pdf",
- "CopySource": copy_source,
- }
-
- s3_client.copy_object(**copy_object_params)
- print(
- f"Copied {source_prefix}/{volume.barcode}.pdf to \
- {volume_prefix}/Volume.pdf"
- )
- else:
- raise Exception(
- f"Cannot upload {source_prefix}/{volume.barcode}.pdf to \
- {volume_prefix}/Volume.pdf: %s"
- % err
- )
-
-
-# Case-specific helper functions
-
-
-def fetch_s3_files(bucket: str, key: str) -> dict:
- """
- Return a dictionary of bucket contents format key: hash
- """
- try:
- s3_contents_hash = {}
- response = s3_client.list_objects_v2(Bucket=bucket, Prefix=key)
- except ClientError as err:
- raise Exception(f"Cannot list objects {bucket}/{key}: %s" % err)
- if "Contents" not in response:
- return s3_contents_hash
- else:
- for case in response["Contents"]:
- # Get the object's metadata
- try:
- response = s3_client.get_object_attributes(
- Bucket=bucket, Key=case["Key"], ObjectAttributes=["Checksum"]
- )
-
- existing_hash = response.get("Checksum", {}).get("ChecksumSHA256")
- s3_contents_hash[case["Key"]] = existing_hash
- except ClientError as err:
- raise Exception(f"Cannot check file {bucket}/{case['Key']}: %s" % err)
-
- return s3_contents_hash
-
-
-# General helper functions
-
-
-def hash_and_upload(contents: str, bucket: str, key: str, content_type: str) -> None:
- """
- Hash created file and upload to S3
- """
- # Calculate the SHA256 hash of the contents data
- hash_object = hashlib.sha256(contents.encode("utf-8"))
- sha256_hash = base64.b64encode(hash_object.digest()).decode()
- # upload file to S3
- try:
- s3_client.put_object(
- Body=contents,
- Bucket=bucket,
- Key=key,
- ContentType=content_type,
- ChecksumSHA256=sha256_hash,
- )
- print(f"Completed {key}")
- except ClientError as err:
- raise Exception(f"Error uploading {key}: %s" % err)
-
-
-def get_bucket_name(redacted: bool) -> str:
- """
- Create bucket name based on redaction status
- """
- if redacted:
- bucket = "cap-redacted"
- else:
- bucket = "cap-unredacted"
- return bucket
diff --git a/capstone/scripts/export.py b/capstone/scripts/export.py
index c1134a3d0..a499ec337 100644
--- a/capstone/scripts/export.py
+++ b/capstone/scripts/export.py
@@ -3,7 +3,6 @@
import tempfile
import zipfile
from io import StringIO
-from collections import namedtuple
from datetime import date
from pathlib import Path
from celery import shared_task
@@ -13,6 +12,7 @@
from django.utils import timezone
from capapi.documents import CaseDocument
+from capapi.resources import call_serializer
from capapi.serializers import NoLoginCaseDocumentSerializer, CaseDocumentSerializer
from capdb.models import Jurisdiction, Reporter
from capdb.storages import download_files_storage
@@ -155,12 +155,6 @@ def export_case_documents(cases, zip_path, filter_item, public=False):
"Bagging-Date: %s\n"
) % (filter_item, timezone.now().strftime("%Y-%m-%d"))
- # fake Request object used for serializing cases with DRF's serializer
- vars['fake_request'] = namedtuple('Request', ['query_params', 'accepted_renderer'])(
- query_params=vars['query_params'],
- accepted_renderer=None,
- )
-
# create new zip file in memory
vars['out_spool'] = tempfile.TemporaryFile()
vars['archive'] = zipfile.ZipFile(vars['out_spool'], 'w', zipfile.ZIP_STORED)
@@ -171,8 +165,8 @@ def export_case_documents(cases, zip_path, filter_item, public=False):
# write each case
for item in cases.scan():
for format_name, vars in formats.items():
- serializer = vars['serializer'](item['_source'], context={'request': vars['fake_request']})
- vars['compressed_data_file'].write(bytes(json.dumps(serializer.data), 'utf8') + b'\n')
+ data = call_serializer(vars['serializer'], item['_source'], vars['query_params'])
+ vars['compressed_data_file'].write(bytes(json.dumps(data), 'utf8') + b'\n')
# finish bag for each format
for format_name, vars in formats.items():
diff --git a/capstone/scripts/export_cap_static.py b/capstone/scripts/export_cap_static.py
new file mode 100644
index 000000000..fbb436188
--- /dev/null
+++ b/capstone/scripts/export_cap_static.py
@@ -0,0 +1,334 @@
+import shutil
+import tempfile
+from pathlib import Path
+
+import boto3
+import json
+from botocore.exceptions import ClientError
+from celery import shared_task
+from django.conf import settings
+from django.db import transaction
+from tqdm import tqdm
+
+from capapi.documents import CaseDocument
+from capapi.resources import call_serializer
+from capapi.serializers import VolumeSerializer, NoLoginCaseDocumentSerializer, ReporterSerializer
+from capdb.models import Reporter, VolumeMetadata, Jurisdiction
+from scripts.update_snippets import get_map_numbers
+
+
+# steps:
+# - export volumes: fab export_cap_static_cases calls export_cases_by_volume()
+# - export reporter metadata: fab summarize_cap_static calls finalize_reporters()
+# - (not in codebase yet) copy PDFs and captars from one part of S3 to another
+
+
+def finalize_reporters(dest_dir: str) -> None:
+    """
+    Run finalize_reporters_dir() on each of dest_dir's "redacted" and "unredacted" subdirectories that exists.
+    """
+    for export_root in (Path(dest_dir) / name for name in ("redacted", "unredacted")):
+        if export_root.exists():
+            finalize_reporters_dir(export_root)
+
+def finalize_reporters_dir(dest_dir: Path) -> None:
+    """
+    Write reporter, volume, and jurisdiction summary files for the export tree rooted at dest_dir.
+    """
+    # write missing reporter metadata
+    print("Writing missing reporter metadata")
+    all_volumes = []
+    for reporter_dir in tqdm(dest_dir.iterdir()):
+        if not reporter_dir.is_dir():
+            continue
+        reporter_metadata_path = reporter_dir / "ReporterMetadata.json"
+        if not reporter_metadata_path.exists():
+            # fetch reporter object; directory names listed in reporter_slug_dict_reverse map to a reporter pk
+            if reporter_dir.name in reporter_slug_dict_reverse:
+                reporter = Reporter.objects.get(pk=reporter_slug_dict_reverse[reporter_dir.name])
+            else:
+                reporter = Reporter.objects.get(short_name_slug=reporter_dir.name)
+            # export reporter metadata
+            reporter_dict = call_serializer(ReporterSerializer, reporter)
+            reporter_dict["harvard_hollis_id"] = reporter.hollis
+            reporter_dict["slug"] = reporter_dir.name
+            remove_keys(reporter_dict, ["url", "frontend_url", ("jurisdictions", ["slug", "whitelisted", "url"])])
+            write_json(reporter_metadata_path, reporter_dict)
+
+        # write reporter-level VolumesMetadata.json, even if ReporterMetadata.json existed, so re-runs stay complete
+        print("Writing VolumesMetadata.json")
+        volumes_metadata = [json.loads(f.read_text()) for f in reporter_dir.glob("*/VolumeMetadata.json")]
+        write_json(reporter_dir / "VolumesMetadata.json", volumes_metadata)
+        all_volumes.extend(volumes_metadata)
+
+    # write ReportersMetadata.json
+    print("Writing ReportersMetadata.json")
+    reporters_metadata = [json.loads(f.read_text()) for f in dest_dir.glob("*/ReporterMetadata.json")]
+    write_json(dest_dir / "ReportersMetadata.json", reporters_metadata)
+
+    # write JurisdictionsMetadata.json
+    # this is the same data as ReportersMetadata.json, but with a list of reporters for each jurisdiction
+    # instead of a list of jurisdictions for each reporter
+    print("Writing JurisdictionsMetadata.json")
+    jurisdictions = {}
+    jurisdiction_counts = get_map_numbers()
+    for jurisdiction in Jurisdiction.objects.all():
+        if jurisdiction.slug not in jurisdiction_counts:
+            continue
+        jurisdictions[jurisdiction.id] = {
+            "id": jurisdiction.pk,
+            "slug": jurisdiction.slug,
+            "name": jurisdiction.name,
+            "name_long": jurisdiction.name_long,
+            **jurisdiction_counts[jurisdiction.slug],
+            "reporters": [],
+        }
+
+    for reporter in reporters_metadata:
+        for jurisdiction in reporter.pop("jurisdictions"):
+            # jurisdictions without counts were skipped above, so they have no entry to attach to
+            if jurisdiction["id"] in jurisdictions:
+                jurisdictions[jurisdiction["id"]]["reporters"].append(reporter)
+
+    jurisdictions = sorted(jurisdictions.values(), key=lambda j: j["name_long"])
+    write_json(dest_dir / "JurisdictionsMetadata.json", jurisdictions)
+
+    # write top-level VolumesMetadata.json
+    print("Writing VolumesMetadata.json")
+    write_json(dest_dir / "VolumesMetadata.json", all_volumes)
+
+
+@shared_task
+def export_cases_by_volume(volume: str, dest_dir: str) -> None:
+    """Export the volume with pk `volume` to dest_dir/redacted and, if it is redacted, to dest_dir/unredacted."""
+    volume_obj = VolumeMetadata.objects.select_related("reporter").get(pk=volume)
+    export_root = Path(dest_dir)
+    export_volume(volume_obj, export_root / "redacted")
+    if not (settings.REDACTION_KEY and volume_obj.redacted):
+        return
+    # redacted volume and a REDACTION_KEY is set: unredact inside a transaction, export, then roll back
+    with transaction.atomic('capdb'):
+        volume_obj.unredact(replace_pdf=False)
+        export_volume(volume_obj, export_root / "unredacted")
+        transaction.set_rollback(True, using='capdb')
+
+def export_volume(volume: VolumeMetadata, dest_dir: Path) -> None:
+    """
+    Export one volume to dest_dir/<reporter slug>/<volume number>/:
+    a .json file (in cases/) and an .html file (in html/) for each in-scope case,
+    CasesMetadata.json with every case's metadata minus casebody, and VolumeMetadata.json.
+    Does nothing if that directory already exists or the volume has no in-scope cases.
+    """
+
+    # set up vars
+    print("Exporting volume", volume.get_frontend_url())
+    reporter_prefix = reporter_slug_dict.get(volume.reporter_id, volume.reporter.short_name_slug)
+    volume_dir = dest_dir / reporter_prefix / volume.volume_number
+
+    # don't overwrite existing volumes
+    if volume_dir.exists():
+        return
+
+    # find cases to write
+    cases = list(volume.case_metadatas.filter(in_scope=True).for_indexing().order_by('case_id'))
+    if not cases:
+        print(f"WARNING: Volume '{volume.barcode}' contains NO CASES.")
+        return
+
+    # set up temp volume dir; the context manager removes it on exit, even if the export fails part-way
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_volume_dir = Path(temp_dir)
+        cases_dir = temp_volume_dir / "cases"
+        cases_dir.mkdir()
+        html_dir = temp_volume_dir / "html"
+        html_dir.mkdir()
+        volume_metadata = volume_to_dict(volume)
+        write_json(temp_volume_dir / "VolumeMetadata.json", volume_metadata)
+
+        # variables for case export loop
+        case_file_name_index = 1
+        prev_case_first_page = None
+        case_metadatas = []
+        case_doc = CaseDocument()
+
+        # store the serialized case data
+        for case in cases:
+            # convert case model to search index format
+            search_item = case_doc.prepare(case)
+            search_item['last_updated'] = search_item['last_updated'].isoformat()
+            search_item['decision_date'] = search_item['decision_date'].isoformat()
+
+            # convert search index format to API format
+            case_data = call_serializer(NoLoginCaseDocumentSerializer, search_item, {"body_format": "text"})
+
+            # update case_data to match our output format:
+            if "casebody" in case_data:
+                case_data["casebody"] = case_data["casebody"]["data"]
+            case_data["first_page_order"] = case.first_page_order
+            case_data["last_page_order"] = case.last_page_order
+            remove_keys(case_data, [
+                "reporter",
+                "volume",
+                "url",
+                "frontend_url",
+                "frontend_pdf_url",
+                "preview",
+                ("court", ["slug", "url"]),
+                ("jurisdiction", ["slug", "whitelisted", "url"]),
+            ])
+            for cite in case_data["cites_to"]:
+                cite["opinion_index"] = cite.pop("opinion_id")
+
+            # calculate casefile name: <first page>-<index among consecutive cases sharing that first page>
+            first_page = case_data["first_page"]
+            if prev_case_first_page == first_page:
+                case_file_name_index += 1
+            else:
+                case_file_name_index = 1
+                prev_case_first_page = first_page
+            case_file_name = f"{first_page:0>4}-{case_file_name_index:0>2}.json"
+
+            # write casefile
+            write_json(cases_dir / case_file_name, case_data)
+
+            # write metadata without 'casebody'
+            case_data.pop("casebody", None)
+            case_metadatas.append(case_data)
+
+            # write html file
+            html_file_path = (html_dir / case_file_name).with_suffix(".html")
+            html_file_path.write_text(search_item["casebody_data"]["html"])
+
+        # write metadata file
+        write_json(temp_volume_dir / "CasesMetadata.json", case_metadatas)
+
+        # copy to real directory while the temp dir still exists
+        volume_dir.parent.mkdir(exist_ok=True, parents=True)
+        shutil.copytree(temp_volume_dir, volume_dir)
+
+
+def volume_to_dict(volume: VolumeMetadata) -> dict:
+    """Return the VolumeSerializer output for volume, reshaped into the exported volume metadata dict."""
+    volume_data = call_serializer(VolumeSerializer, volume)
+
+    # expose the serializer's "barcode" value under the "id" key instead
+    volume_data["id"] = volume_data.pop("barcode", None)
+
+    # add additional fields from model
+    volume_data.update({
+        "harvard_hollis_id": volume.hollis_number,
+        "spine_start_year": volume.spine_start_year,
+        "spine_end_year": volume.spine_end_year,
+        "publication_city": volume.publication_city,
+        "second_part_of_id": volume.second_part_of_id,
+    })
+
+    # describe the volume's nominative reporter: linked object, loose fields, or nothing
+    if volume.nominative_reporter_id:
+        nominative_info = {
+            "id": volume.nominative_reporter_id,
+            "short_name": volume.nominative_reporter.short_name,
+            "full_name": volume.nominative_reporter.full_name,
+            "volume_number": volume.nominative_volume_number,
+        }
+    elif volume.nominative_volume_number or volume.nominative_name:
+        nominative_info = {
+            "volume_number": volume.nominative_volume_number,
+            "nominative_name": volume.nominative_name,
+        }
+    else:
+        nominative_info = None
+    volume_data["nominative_reporter"] = nominative_info
+
+    # remove unnecessary fields
+    remove_keys(volume_data, [
+        "reporter",
+        "reporter_url",
+        "url",
+        "pdf_url",
+        "frontend_url",
+        "nominative_volume_number",
+        "nominative_name",
+        ("jurisdictions", ["slug", "whitelisted", "url"]),
+    ])
+    return volume_data
+
+
+def copy_volume_pdf(
+    volume: object, volume_prefix: str, dest_bucket: str, redacted: bool
+) -> None:
+    """
+    Copy PDF volume from original location to destination bucket, unless it is already there.
+
+    volume only needs a `barcode` attribute; redacted selects the "pdf/redacted" or
+    "pdf/unredacted" source prefix. The PDF is written to <volume_prefix>/Volume.pdf.
+    Raises Exception (chained to the ClientError) if the existence check fails with anything but a 404.
+    """
+    s3_client = boto3.client("s3")
+    source_prefix = "pdf/redacted" if redacted else "pdf/unredacted"
+    dest_key = f"{volume_prefix}/Volume.pdf"
+
+    # a successful HEAD request means the PDF is already in the destination bucket
+    try:
+        s3_client.head_object(Bucket=dest_bucket, Key=dest_key)
+    except ClientError as err:
+        source_key = f"{source_prefix}/{volume.barcode}.pdf"
+        if err.response["Error"]["Code"] != "404":
+            # build the message in one piece: no stray indentation from line continuations, and
+            # no %-formatting applied to text that may itself contain "%"
+            raise Exception(
+                f"Cannot upload {source_key} to {dest_key}: {err}"
+            ) from err
+
+        # "With a copy command, the checksum of the object is a direct checksum of the full object."
+        # https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html
+        copy_source = {
+            "Bucket": "harvard-cap-archive",
+            "Key": source_key,
+        }
+
+        s3_client.copy_object(
+            Bucket=dest_bucket,
+            Key=dest_key,
+            CopySource=copy_source,
+        )
+        print(f"Copied {source_key} to {dest_key}")
+    else:
+        print(f"{dest_bucket}/{dest_key} already uploaded!")
+
+
+
+### helpers ###
+
+# Some reporters share a slug, so these reporter ids are exported under a distinct directory name instead
+reporter_slug_dict = {
+    415: "us-ct-cl",
+    657: "wv-ct-cl",
+    580: "mass-app-div-annual",
+    576: "mass-app-div",
+}
+reporter_slug_dict_reverse = {v: k for k, v in reporter_slug_dict.items()}  # directory name -> reporter id
+
+def remove_keys(results: dict, keys: list) -> dict:
+    """
+    Remove keys from results dict, in place, and return it. A (key, subkeys) tuple in keys
+    removes subkeys from the dict, or from each dict of the list, stored at results[key].
+    Missing or null nested values are skipped.
+    """
+    for key in keys:
+        if not isinstance(key, tuple):
+            results.pop(key, None)
+            continue
+        key, subkeys = key
+        nested = results.get(key)
+        for item in nested if isinstance(nested, list) else [nested]:
+            # a missing or null nested value has nothing to remove
+            if item is not None:
+                remove_keys(item, subkeys)
+    return results
+
+def write_json(path: Path, contents) -> None:
+    """Serialize contents as JSON with 2-space indentation and write it, plus a trailing newline, to path."""
+    serialized = json.dumps(contents, indent=2)
+    serialized += "\n"
+    path.write_text(serialized)
diff --git a/capstone/scripts/tests/test_cap_static.py b/capstone/scripts/tests/test_cap_static.py
new file mode 100644
index 000000000..e90feed2e
--- /dev/null
+++ b/capstone/scripts/tests/test_cap_static.py
@@ -0,0 +1,48 @@
+import shutil
+from pathlib import Path
+
+import pytest
+from django.conf import settings
+
+from capdb.models import VolumeMetadata
+from fabfile import export_cap_static_cases, summarize_cap_static, update_elasticsearch_from_queue
+from test_data.test_fixtures.helpers import check_path
+
+
+@pytest.mark.django_db(databases=['capdb'])
+def test_export_cap_static(case_factory, jurisdiction_factory, redacted_case_factory, volume_metadata_factory, reporter_factory, tmp_path, pytestconfig, elasticsearch, django_assert_num_queries):
+    """Export two redacted volumes and compare the resulting files with test_data/cap_static."""
+    # set up a reporter with two volumes, each with three cases
+    jurisdiction = jurisdiction_factory(name_long="United States", name="U.S.", slug='us')
+    jurisdiction2 = jurisdiction_factory(name_long="Massachusetts", name="Mass.", slug='mass')
+    reporter = reporter_factory(full_name="United States Reports", short_name="U.S.", short_name_slug='us')
+    reporter.jurisdictions.set([jurisdiction, jurisdiction2])
+    volumes = [volume_metadata_factory(volume_number=volume_number, reporter=reporter, redacted=True) for volume_number in ("1", "2")]
+    for volume in volumes:
+        case_factory(volume=volume, first_page="1", reporter=reporter, jurisdiction=jurisdiction)
+        case_factory(volume=volume, first_page="2", reporter=reporter, jurisdiction=jurisdiction)
+        redacted_case_factory(volume=volume, first_page="2", reporter=reporter, jurisdiction=jurisdiction2)
+    # for some reason case_factory is creating extra volumes, so mark those as out of scope
+    VolumeMetadata.objects.exclude(pk__in=[v.pk for v in volumes]).update(out_of_scope=True)
+    update_elasticsearch_from_queue()
+    # run export to temp dir
+    with django_assert_num_queries(select=37, update=8, insert=2, delete=2, rollback=2):
+        export_cap_static_cases(dest_dir=str(tmp_path))
+    with django_assert_num_queries(select=8):
+        summarize_cap_static(str(tmp_path))
+
+    # compare temp dir to test_data/cap_static
+    cap_static_dir = Path(settings.BASE_DIR, 'test_data/cap_static')
+    if pytestconfig.getoption('recreate_files'):
+        # if --recreate-files was passed, copy temp dir to test_data/cap_static instead of checking
+        if cap_static_dir.exists():
+            shutil.rmtree(cap_static_dir)
+        shutil.copytree(tmp_path, cap_static_dir)
+    else:
+        # compare sorted listings, since rglob() yields entries in arbitrary order
+        cap_static_paths = sorted(p.relative_to(cap_static_dir) for p in cap_static_dir.rglob('*'))
+        tmp_paths = sorted(p.relative_to(tmp_path) for p in tmp_path.rglob('*'))
+        assert cap_static_paths == tmp_paths, "Missing or extra files in cap_static export."
+        for path in tmp_path.rglob('*'):
+            if path.is_file():
+                check_path(pytestconfig, path, cap_static_dir / path.relative_to(tmp_path))
diff --git a/capstone/scripts/tests/test_fastcase.py b/capstone/scripts/tests/test_fastcase.py
index 78deefcf2..2c2f3d831 100644
--- a/capstone/scripts/tests/test_fastcase.py
+++ b/capstone/scripts/tests/test_fastcase.py
@@ -13,7 +13,7 @@
from fabfile import refresh_case_body_cache
from scripts.fastcase import ingest_fastcase
from scripts.fastcase.format_fastcase import segment_paragraphs
-from test_data.test_fixtures.helpers import sort_nested_dict
+from test_data.test_fixtures.helpers import sort_nested_dict, check_path
@pytest.mark.parametrize("input,expected", [
@@ -66,19 +66,6 @@ def test_fastcase_ingest(tmp_path, pytestconfig, elasticsearch):
copy_tree(str(fastcase_dir), str(tmp_path))
management.call_command('loaddata', 'capdb/fixtures/jurisdiction.capdb.json.gz', 'capdb/fixtures/reporter.capdb.json.gz', database='capdb')
- # helper to check whether files have changed
- def check_path(new_contents, saved_path):
- if isinstance(new_contents, Path):
- new_contents = new_contents.read_text()
- old_contents = saved_path.read_text() if saved_path.exists() else ''
- if new_contents != old_contents:
- if pytestconfig.getoption('recreate_fastcase_files'):
- saved_path.write_text(new_contents)
- elif old_contents:
- assert new_contents == old_contents, f"File {saved_path} has changed. Run pytest -k test_fastcase_ingest --recreate_fastcase_files to update."
- else:
- assert False, f"File {saved_path} does not exist. Run pytest -k test_fastcase_ingest --recreate_fastcase_files to update."
-
# run the ingest
ingest_fastcase.pack_volumes(tmp_path, recreate=True)
ingest_fastcase.main(batch='test_batch', base_dir=tmp_path)
@@ -104,10 +91,11 @@ def check_path(new_contents, saved_path):
case_data['has_body_cache'] = bool(case.body_cache)
cases[case.case_id] = case_data
# check case html for changes
- check_path(case.body_cache.html, fastcase_dir.joinpath(case.fastcase_import.path).with_suffix('.html'))
+ check_path(pytestconfig, case.body_cache.html,
+ fastcase_dir.joinpath(case.fastcase_import.path).with_suffix('.html'))
volume_data['cases'] = cases
data[volume.pk] = volume_data
data = sort_nested_dict(data)
# check metadata files for changes
- check_path(yaml.dump(data), fastcase_dir / 'data.yml')
+ check_path(pytestconfig, yaml.dump(data), fastcase_dir / 'data.yml')
diff --git a/capstone/scripts/update_snippets.py b/capstone/scripts/update_snippets.py
index 4e0ba8643..f5ca6dc8b 100644
--- a/capstone/scripts/update_snippets.py
+++ b/capstone/scripts/update_snippets.py
@@ -112,6 +112,11 @@ def update_map_numbers():
""" Write map_numbers snippet. """
label = "map_numbers"
snippet_format = "application/json"
+ output = get_map_numbers()
+ write_update(label, snippet_format, json.dumps(output))
+
+
+def get_map_numbers():
cursor = connections['capdb'].cursor()
cursor.execute(r"""
SELECT
@@ -121,16 +126,19 @@ def update_map_numbers():
COUNT(DISTINCT c.reporter_id) AS reporter_count,
SUM(CASE WHEN (c.first_page||c.last_page)~E'^\\d+$' THEN c.last_page::integer-c.first_page::integer+1 ELSE 1 END) AS page_count
FROM capdb_jurisdiction j
- LEFT JOIN capdb_casemetadata c ON j.id=c.jurisdiction_id
+ JOIN capdb_casemetadata c ON j.id=c.jurisdiction_id
+ JOIN capdb_volumemetadata v ON c.volume_id=v.barcode
WHERE
c.in_scope IS True
+ AND v.out_of_scope IS False
GROUP BY j.id;
""")
# get column names from sql query
cols = [col[0] for col in cursor.description]
# create output where each key is a jurisdiction and each value is a dict of values from the sql query
output = {row[0]: dict(zip(cols[1:], row[1:])) for row in cursor.fetchall()}
- write_update(label, snippet_format, json.dumps(output))
+ return output
+
def search_jurisdiction_list():
jurisdictions = [ (jurisdiction.slug, jurisdiction.name_long)
diff --git a/capstone/test_data/cap_static/redacted/JurisdictionsMetadata.json b/capstone/test_data/cap_static/redacted/JurisdictionsMetadata.json
new file mode 100644
index 000000000..459e5a1ac
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/JurisdictionsMetadata.json
@@ -0,0 +1,44 @@
+[
+ {
+ "id": 2,
+ "slug": "mass",
+ "name": "Mass.",
+ "name_long": "Massachusetts",
+ "case_count": 2,
+ "volume_count": 2,
+ "reporter_count": 1,
+ "page_count": 10,
+ "reporters": [
+ {
+ "id": 1,
+ "full_name": "United States Reports",
+ "short_name": "U.S.",
+ "start_year": 1900,
+ "end_year": 2000,
+ "harvard_hollis_id": [],
+ "slug": "us"
+ }
+ ]
+ },
+ {
+ "id": 1,
+ "slug": "us",
+ "name": "U.S.",
+ "name_long": "United States",
+ "case_count": 4,
+ "volume_count": 2,
+ "reporter_count": 1,
+ "page_count": 20,
+ "reporters": [
+ {
+ "id": 1,
+ "full_name": "United States Reports",
+ "short_name": "U.S.",
+ "start_year": 1900,
+ "end_year": 2000,
+ "harvard_hollis_id": [],
+ "slug": "us"
+ }
+ ]
+ }
+]
diff --git a/capstone/test_data/cap_static/redacted/ReportersMetadata.json b/capstone/test_data/cap_static/redacted/ReportersMetadata.json
new file mode 100644
index 000000000..d166b62ca
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/ReportersMetadata.json
@@ -0,0 +1,23 @@
+[
+ {
+ "id": 1,
+ "full_name": "United States Reports",
+ "short_name": "U.S.",
+ "start_year": 1900,
+ "end_year": 2000,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "harvard_hollis_id": [],
+ "slug": "us"
+ }
+]
diff --git a/capstone/test_data/cap_static/redacted/VolumesMetadata.json b/capstone/test_data/cap_static/redacted/VolumesMetadata.json
new file mode 100644
index 000000000..72c2be61a
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/VolumesMetadata.json
@@ -0,0 +1,60 @@
+[
+ [
+ {
+ "volume_number": "1",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "4909170303750",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+ },
+ {
+ "volume_number": "2",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "5012832128833",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+ }
+ ]
+]
diff --git a/capstone/test_data/cap_static/redacted/us/1/CasesMetadata.json b/capstone/test_data/cap_static/redacted/us/1/CasesMetadata.json
new file mode 100644
index 000000000..fb954bb3f
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/1/CasesMetadata.json
@@ -0,0 +1,136 @@
+[
+ {
+ "id": 1,
+ "name": "First Foo0 versus First Bar0",
+ "name_abbreviation": "Foo0 v. Bar0",
+ "decision_date": "1900-01-01",
+ "docket_number": "",
+ "first_page": "1",
+ "last_page": "5",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "28 U.S. 347"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Sound spend.",
+ "id": 1,
+ "name": "Father worry common past recognize."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [
+ {
+ "cite": "362 U.S. 816",
+ "category": null,
+ "reporter": null,
+ "opinion_index": -1
+ }
+ ],
+ "analysis": {
+ "random_id": 3210483407,
+ "random_bucket": 5839
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 3,
+ "last_page_order": 7
+ },
+ {
+ "id": 2,
+ "name": "First Foo1 versus First Bar1",
+ "name_abbreviation": "Foo1 v. Bar1",
+ "decision_date": "1901-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "257 U.S. 222"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Upon.",
+ "id": 2,
+ "name": "Opportunity cup speech."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [
+ {
+ "cite": "934 U.S. 230",
+ "category": null,
+ "reporter": null,
+ "opinion_index": -1
+ }
+ ],
+ "analysis": {
+ "random_id": 1950914210,
+ "random_bucket": 38562
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+ },
+ {
+ "id": 3,
+ "name": "First Foo2 versus First Bar2",
+ "name_abbreviation": "Foo2 v. Bar2",
+ "decision_date": "1902-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "311 U.S. 951"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Rule six your.",
+ "id": 3,
+ "name": "Little perhaps look many."
+ },
+ "jurisdiction": {
+ "id": 2,
+ "name_long": "Massachusetts",
+ "name": "Mass."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 2,
+ "char_count": 14,
+ "ocr_confidence": 0,
+ "sha256": "8874c1d06c05904e12fef4e76fcdd0ce48b33013e2067d5ebba936f78b8c5d89",
+ "simhash": "1:0000000000000000",
+ "word_count": 2,
+ "random_id": 2863198571,
+ "random_bucket": 61803
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+ }
+]
diff --git a/capstone/test_data/cap_static/redacted/us/1/VolumeMetadata.json b/capstone/test_data/cap_static/redacted/us/1/VolumeMetadata.json
new file mode 100644
index 000000000..2f4470a42
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/1/VolumeMetadata.json
@@ -0,0 +1,28 @@
+{
+ "volume_number": "1",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "4909170303750",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+}
diff --git a/capstone/test_data/cap_static/redacted/us/1/cases/0001-01.json b/capstone/test_data/cap_static/redacted/us/1/cases/0001-01.json
new file mode 100644
index 000000000..d9010b57b
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/1/cases/0001-01.json
@@ -0,0 +1,66 @@
+{
+ "id": 1,
+ "name": "First Foo0 versus First Bar0",
+ "name_abbreviation": "Foo0 v. Bar0",
+ "decision_date": "1900-01-01",
+ "docket_number": "",
+ "first_page": "1",
+ "last_page": "5",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "28 U.S. 347"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Sound spend.",
+ "id": 1,
+ "name": "Father worry common past recognize."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [
+ {
+ "cite": "362 U.S. 816",
+ "category": null,
+ "reporter": null,
+ "opinion_index": -1
+ }
+ ],
+ "analysis": {
+ "random_id": 3210483407,
+ "random_bucket": 5839
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [
+ "Fearing, C.J., and Korsmo, J., concur."
+ ],
+ "parties": [
+ "In the Matter of the Marriage of Christy Lyle, Respondent, and Keith Lyle, Appellant."
+ ],
+ "opinions": [
+ {
+ "text": "Opinion text",
+ "type": "majority",
+ "author": "Pennell, J."
+ }
+ ],
+ "attorneys": [
+ "Matthew J. Dudley, for appellant.",
+ "Camerina I. Brokaw-Zorrozua (of Maxey Law Office PS), for respondent."
+ ],
+ "corrections": "",
+ "head_matter": "head matter"
+ },
+ "first_page_order": 3,
+ "last_page_order": 7
+}
diff --git a/capstone/test_data/cap_static/redacted/us/1/cases/0002-01.json b/capstone/test_data/cap_static/redacted/us/1/cases/0002-01.json
new file mode 100644
index 000000000..099ae6342
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/1/cases/0002-01.json
@@ -0,0 +1,66 @@
+{
+ "id": 2,
+ "name": "First Foo1 versus First Bar1",
+ "name_abbreviation": "Foo1 v. Bar1",
+ "decision_date": "1901-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "257 U.S. 222"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Upon.",
+ "id": 2,
+ "name": "Opportunity cup speech."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [
+ {
+ "cite": "934 U.S. 230",
+ "category": null,
+ "reporter": null,
+ "opinion_index": -1
+ }
+ ],
+ "analysis": {
+ "random_id": 1950914210,
+ "random_bucket": 38562
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [
+ "Fearing, C.J., and Korsmo, J., concur."
+ ],
+ "parties": [
+ "In the Matter of the Marriage of Christy Lyle, Respondent, and Keith Lyle, Appellant."
+ ],
+ "opinions": [
+ {
+ "text": "Opinion text",
+ "type": "majority",
+ "author": "Pennell, J."
+ }
+ ],
+ "attorneys": [
+ "Matthew J. Dudley, for appellant.",
+ "Camerina I. Brokaw-Zorrozua (of Maxey Law Office PS), for respondent."
+ ],
+ "corrections": "",
+ "head_matter": "head matter"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+}
diff --git a/capstone/test_data/cap_static/redacted/us/1/cases/0002-02.json b/capstone/test_data/cap_static/redacted/us/1/cases/0002-02.json
new file mode 100644
index 000000000..3dd2a37bc
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/1/cases/0002-02.json
@@ -0,0 +1,58 @@
+{
+ "id": 3,
+ "name": "First Foo2 versus First Bar2",
+ "name_abbreviation": "Foo2 v. Bar2",
+ "decision_date": "1902-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "311 U.S. 951"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Rule six your.",
+ "id": 3,
+ "name": "Little perhaps look many."
+ },
+ "jurisdiction": {
+ "id": 2,
+ "name_long": "Massachusetts",
+ "name": "Mass."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 2,
+ "char_count": 14,
+ "ocr_confidence": 0,
+ "sha256": "8874c1d06c05904e12fef4e76fcdd0ce48b33013e2067d5ebba936f78b8c5d89",
+ "simhash": "1:0000000000000000",
+ "word_count": 2,
+ "random_id": 2863198571,
+ "random_bucket": 61803
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [],
+ "parties": [],
+ "opinions": [
+ {
+ "text": "not redacted",
+ "type": "majority",
+ "author": null
+ }
+ ],
+ "attorneys": [],
+ "corrections": "",
+ "head_matter": ""
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+}
diff --git a/capstone/test_data/cap_static/redacted/us/1/html/0001-01.html b/capstone/test_data/cap_static/redacted/us/1/html/0001-01.html
new file mode 100644
index 000000000..5357a97ee
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/1/html/0001-01.html
@@ -0,0 +1 @@
+Case html 0
\ No newline at end of file
diff --git a/capstone/test_data/cap_static/redacted/us/1/html/0002-01.html b/capstone/test_data/cap_static/redacted/us/1/html/0002-01.html
new file mode 100644
index 000000000..56e046328
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/1/html/0002-01.html
@@ -0,0 +1 @@
+Case html 1
\ No newline at end of file
diff --git a/capstone/test_data/cap_static/redacted/us/1/html/0002-02.html b/capstone/test_data/cap_static/redacted/us/1/html/0002-02.html
new file mode 100644
index 000000000..13bc3490b
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/1/html/0002-02.html
@@ -0,0 +1,6 @@
+
diff --git a/capstone/test_data/cap_static/redacted/us/2/CasesMetadata.json b/capstone/test_data/cap_static/redacted/us/2/CasesMetadata.json
new file mode 100644
index 000000000..6d1ea2ac7
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/2/CasesMetadata.json
@@ -0,0 +1,136 @@
+[
+ {
+ "id": 4,
+ "name": "First Foo3 versus First Bar3",
+ "name_abbreviation": "Foo3 v. Bar3",
+ "decision_date": "1903-01-01",
+ "docket_number": "",
+ "first_page": "1",
+ "last_page": "5",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "447 U.S. 189"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Quickly walk.",
+ "id": 4,
+ "name": "Focus detail several position."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [
+ {
+ "cite": "524 U.S. 591",
+ "category": null,
+ "reporter": null,
+ "opinion_index": -1
+ }
+ ],
+ "analysis": {
+ "random_id": 3673096554,
+ "random_bucket": 362
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 3,
+ "last_page_order": 7
+ },
+ {
+ "id": 5,
+ "name": "First Foo4 versus First Bar4",
+ "name_abbreviation": "Foo4 v. Bar4",
+ "decision_date": "1904-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "872 U.S. 266"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Like area.",
+ "id": 5,
+ "name": "Commercial edge agency ground risk."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [
+ {
+ "cite": "596 U.S. 768",
+ "category": null,
+ "reporter": null,
+ "opinion_index": -1
+ }
+ ],
+ "analysis": {
+ "random_id": 2595981630,
+ "random_bucket": 35134
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+ },
+ {
+ "id": 6,
+ "name": "First Foo5 versus First Bar5",
+ "name_abbreviation": "Foo5 v. Bar5",
+ "decision_date": "1905-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "958 U.S. 5"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Same religious.",
+ "id": 6,
+ "name": "Size fish back degree."
+ },
+ "jurisdiction": {
+ "id": 2,
+ "name_long": "Massachusetts",
+ "name": "Mass."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 2,
+ "char_count": 14,
+ "ocr_confidence": 0,
+ "sha256": "8874c1d06c05904e12fef4e76fcdd0ce48b33013e2067d5ebba936f78b8c5d89",
+ "simhash": "1:0000000000000000",
+ "word_count": 2,
+ "random_id": 3120486931,
+ "random_bucket": 55827
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+ }
+]
diff --git a/capstone/test_data/cap_static/redacted/us/2/VolumeMetadata.json b/capstone/test_data/cap_static/redacted/us/2/VolumeMetadata.json
new file mode 100644
index 000000000..34bd2f2f0
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/2/VolumeMetadata.json
@@ -0,0 +1,28 @@
+{
+ "volume_number": "2",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "5012832128833",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+}
diff --git a/capstone/test_data/cap_static/redacted/us/2/cases/0001-01.json b/capstone/test_data/cap_static/redacted/us/2/cases/0001-01.json
new file mode 100644
index 000000000..f2556140b
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/2/cases/0001-01.json
@@ -0,0 +1,66 @@
+{
+ "id": 4,
+ "name": "First Foo3 versus First Bar3",
+ "name_abbreviation": "Foo3 v. Bar3",
+ "decision_date": "1903-01-01",
+ "docket_number": "",
+ "first_page": "1",
+ "last_page": "5",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "447 U.S. 189"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Quickly walk.",
+ "id": 4,
+ "name": "Focus detail several position."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [
+ {
+ "cite": "524 U.S. 591",
+ "category": null,
+ "reporter": null,
+ "opinion_index": -1
+ }
+ ],
+ "analysis": {
+ "random_id": 3673096554,
+ "random_bucket": 362
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [
+ "Fearing, C.J., and Korsmo, J., concur."
+ ],
+ "parties": [
+ "In the Matter of the Marriage of Christy Lyle, Respondent, and Keith Lyle, Appellant."
+ ],
+ "opinions": [
+ {
+ "text": "Opinion text",
+ "type": "majority",
+ "author": "Pennell, J."
+ }
+ ],
+ "attorneys": [
+ "Matthew J. Dudley, for appellant.",
+ "Camerina I. Brokaw-Zorrozua (of Maxey Law Office PS), for respondent."
+ ],
+ "corrections": "",
+ "head_matter": "head matter"
+ },
+ "first_page_order": 3,
+ "last_page_order": 7
+}
diff --git a/capstone/test_data/cap_static/redacted/us/2/cases/0002-01.json b/capstone/test_data/cap_static/redacted/us/2/cases/0002-01.json
new file mode 100644
index 000000000..db6983ef4
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/2/cases/0002-01.json
@@ -0,0 +1,66 @@
+{
+ "id": 5,
+ "name": "First Foo4 versus First Bar4",
+ "name_abbreviation": "Foo4 v. Bar4",
+ "decision_date": "1904-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "872 U.S. 266"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Like area.",
+ "id": 5,
+ "name": "Commercial edge agency ground risk."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [
+ {
+ "cite": "596 U.S. 768",
+ "category": null,
+ "reporter": null,
+ "opinion_index": -1
+ }
+ ],
+ "analysis": {
+ "random_id": 2595981630,
+ "random_bucket": 35134
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [
+ "Fearing, C.J., and Korsmo, J., concur."
+ ],
+ "parties": [
+ "In the Matter of the Marriage of Christy Lyle, Respondent, and Keith Lyle, Appellant."
+ ],
+ "opinions": [
+ {
+ "text": "Opinion text",
+ "type": "majority",
+ "author": "Pennell, J."
+ }
+ ],
+ "attorneys": [
+ "Matthew J. Dudley, for appellant.",
+ "Camerina I. Brokaw-Zorrozua (of Maxey Law Office PS), for respondent."
+ ],
+ "corrections": "",
+ "head_matter": "head matter"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+}
diff --git a/capstone/test_data/cap_static/redacted/us/2/cases/0002-02.json b/capstone/test_data/cap_static/redacted/us/2/cases/0002-02.json
new file mode 100644
index 000000000..007da6102
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/2/cases/0002-02.json
@@ -0,0 +1,58 @@
+{
+ "id": 6,
+ "name": "First Foo5 versus First Bar5",
+ "name_abbreviation": "Foo5 v. Bar5",
+ "decision_date": "1905-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "958 U.S. 5"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Same religious.",
+ "id": 6,
+ "name": "Size fish back degree."
+ },
+ "jurisdiction": {
+ "id": 2,
+ "name_long": "Massachusetts",
+ "name": "Mass."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 2,
+ "char_count": 14,
+ "ocr_confidence": 0,
+ "sha256": "8874c1d06c05904e12fef4e76fcdd0ce48b33013e2067d5ebba936f78b8c5d89",
+ "simhash": "1:0000000000000000",
+ "word_count": 2,
+ "random_id": 3120486931,
+ "random_bucket": 55827
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [],
+ "parties": [],
+ "opinions": [
+ {
+ "text": "not redacted",
+ "type": "majority",
+ "author": null
+ }
+ ],
+ "attorneys": [],
+ "corrections": "",
+ "head_matter": ""
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+}
diff --git a/capstone/test_data/cap_static/redacted/us/2/html/0001-01.html b/capstone/test_data/cap_static/redacted/us/2/html/0001-01.html
new file mode 100644
index 000000000..682bfc16f
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/2/html/0001-01.html
@@ -0,0 +1 @@
+Case html 3
\ No newline at end of file
diff --git a/capstone/test_data/cap_static/redacted/us/2/html/0002-01.html b/capstone/test_data/cap_static/redacted/us/2/html/0002-01.html
new file mode 100644
index 000000000..75d3446a8
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/2/html/0002-01.html
@@ -0,0 +1 @@
+Case html 4
\ No newline at end of file
diff --git a/capstone/test_data/cap_static/redacted/us/2/html/0002-02.html b/capstone/test_data/cap_static/redacted/us/2/html/0002-02.html
new file mode 100644
index 000000000..75cea3a52
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/2/html/0002-02.html
@@ -0,0 +1,6 @@
+
diff --git a/capstone/test_data/cap_static/redacted/us/ReporterMetadata.json b/capstone/test_data/cap_static/redacted/us/ReporterMetadata.json
new file mode 100644
index 000000000..85516ff50
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/ReporterMetadata.json
@@ -0,0 +1,21 @@
+{
+ "id": 1,
+ "full_name": "United States Reports",
+ "short_name": "U.S.",
+ "start_year": 1900,
+ "end_year": 2000,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "harvard_hollis_id": [],
+ "slug": "us"
+}
diff --git a/capstone/test_data/cap_static/redacted/us/VolumesMetadata.json b/capstone/test_data/cap_static/redacted/us/VolumesMetadata.json
new file mode 100644
index 000000000..1a24582d4
--- /dev/null
+++ b/capstone/test_data/cap_static/redacted/us/VolumesMetadata.json
@@ -0,0 +1,58 @@
+[
+ {
+ "volume_number": "1",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "4909170303750",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+ },
+ {
+ "volume_number": "2",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "5012832128833",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+ }
+]
diff --git a/capstone/test_data/cap_static/unredacted/JurisdictionsMetadata.json b/capstone/test_data/cap_static/unredacted/JurisdictionsMetadata.json
new file mode 100644
index 000000000..459e5a1ac
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/JurisdictionsMetadata.json
@@ -0,0 +1,44 @@
+[
+ {
+ "id": 2,
+ "slug": "mass",
+ "name": "Mass.",
+ "name_long": "Massachusetts",
+ "case_count": 2,
+ "volume_count": 2,
+ "reporter_count": 1,
+ "page_count": 10,
+ "reporters": [
+ {
+ "id": 1,
+ "full_name": "United States Reports",
+ "short_name": "U.S.",
+ "start_year": 1900,
+ "end_year": 2000,
+ "harvard_hollis_id": [],
+ "slug": "us"
+ }
+ ]
+ },
+ {
+ "id": 1,
+ "slug": "us",
+ "name": "U.S.",
+ "name_long": "United States",
+ "case_count": 4,
+ "volume_count": 2,
+ "reporter_count": 1,
+ "page_count": 20,
+ "reporters": [
+ {
+ "id": 1,
+ "full_name": "United States Reports",
+ "short_name": "U.S.",
+ "start_year": 1900,
+ "end_year": 2000,
+ "harvard_hollis_id": [],
+ "slug": "us"
+ }
+ ]
+ }
+]
diff --git a/capstone/test_data/cap_static/unredacted/ReportersMetadata.json b/capstone/test_data/cap_static/unredacted/ReportersMetadata.json
new file mode 100644
index 000000000..d166b62ca
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/ReportersMetadata.json
@@ -0,0 +1,23 @@
+[
+ {
+ "id": 1,
+ "full_name": "United States Reports",
+ "short_name": "U.S.",
+ "start_year": 1900,
+ "end_year": 2000,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "harvard_hollis_id": [],
+ "slug": "us"
+ }
+]
diff --git a/capstone/test_data/cap_static/unredacted/VolumesMetadata.json b/capstone/test_data/cap_static/unredacted/VolumesMetadata.json
new file mode 100644
index 000000000..72c2be61a
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/VolumesMetadata.json
@@ -0,0 +1,60 @@
+[
+ [
+ {
+ "volume_number": "1",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "4909170303750",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+ },
+ {
+ "volume_number": "2",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "5012832128833",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+ }
+ ]
+]
diff --git a/capstone/test_data/cap_static/unredacted/us/1/CasesMetadata.json b/capstone/test_data/cap_static/unredacted/us/1/CasesMetadata.json
new file mode 100644
index 000000000..f5ef1c8ff
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/1/CasesMetadata.json
@@ -0,0 +1,134 @@
+[
+ {
+ "id": 1,
+ "name": "First Foo0 versus First Bar0",
+ "name_abbreviation": "Foo0 v. Bar0",
+ "decision_date": "1900-01-01",
+ "docket_number": "",
+ "first_page": "1",
+ "last_page": "5",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "28 U.S. 347"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Sound spend.",
+ "id": 1,
+ "name": "Father worry common past recognize."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 47,
+ "ocr_confidence": 1.0,
+ "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc",
+ "simhash": "1:6e45862a08eb1d4c",
+ "word_count": 11,
+ "random_id": 3210483407,
+ "random_bucket": 5839
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 3,
+ "last_page_order": 7
+ },
+ {
+ "id": 2,
+ "name": "First Foo1 versus First Bar1",
+ "name_abbreviation": "Foo1 v. Bar1",
+ "decision_date": "1901-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "257 U.S. 222"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Upon.",
+ "id": 2,
+ "name": "Opportunity cup speech."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 47,
+ "ocr_confidence": 1.0,
+ "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc",
+ "simhash": "1:6e45862a08eb1d4c",
+ "word_count": 11,
+ "random_id": 1950914210,
+ "random_bucket": 38562
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+ },
+ {
+ "id": 3,
+ "name": "First Foo2 versus First Bar2",
+ "name_abbreviation": "Foo2 v. Bar2",
+ "decision_date": "1902-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "311 U.S. 951"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Rule six your.",
+ "id": 3,
+ "name": "Little perhaps look many."
+ },
+ "jurisdiction": {
+ "id": 2,
+ "name_long": "Massachusetts",
+ "name": "Mass."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 39,
+ "ocr_confidence": 0,
+ "sha256": "e0819f285636dcbd644be2d72c1ef1e0e616ca51d3445280adbf00eab401e7c5",
+ "simhash": "1:03208952f875022c",
+ "word_count": 8,
+ "random_id": 2863198571,
+ "random_bucket": 61803
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+ }
+]
diff --git a/capstone/test_data/cap_static/unredacted/us/1/VolumeMetadata.json b/capstone/test_data/cap_static/unredacted/us/1/VolumeMetadata.json
new file mode 100644
index 000000000..2f4470a42
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/1/VolumeMetadata.json
@@ -0,0 +1,28 @@
+{
+ "volume_number": "1",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "4909170303750",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+}
diff --git a/capstone/test_data/cap_static/unredacted/us/1/cases/0001-01.json b/capstone/test_data/cap_static/unredacted/us/1/cases/0001-01.json
new file mode 100644
index 000000000..151f59be7
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/1/cases/0001-01.json
@@ -0,0 +1,60 @@
+{
+ "id": 1,
+ "name": "First Foo0 versus First Bar0",
+ "name_abbreviation": "Foo0 v. Bar0",
+ "decision_date": "1900-01-01",
+ "docket_number": "",
+ "first_page": "1",
+ "last_page": "5",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "28 U.S. 347"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Sound spend.",
+ "id": 1,
+ "name": "Father worry common past recognize."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 47,
+ "ocr_confidence": 1.0,
+ "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc",
+ "simhash": "1:6e45862a08eb1d4c",
+ "word_count": 11,
+ "random_id": 3210483407,
+ "random_bucket": 5839
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [],
+ "parties": [
+ "Case text 0"
+ ],
+ "opinions": [
+ {
+ "text": "Case text 1Case text 2\nCase text 3",
+ "type": "majority",
+ "author": null
+ }
+ ],
+ "attorneys": [],
+ "corrections": "",
+ "head_matter": "Case text 0"
+ },
+ "first_page_order": 3,
+ "last_page_order": 7
+}
diff --git a/capstone/test_data/cap_static/unredacted/us/1/cases/0002-01.json b/capstone/test_data/cap_static/unredacted/us/1/cases/0002-01.json
new file mode 100644
index 000000000..e6350e473
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/1/cases/0002-01.json
@@ -0,0 +1,60 @@
+{
+ "id": 2,
+ "name": "First Foo1 versus First Bar1",
+ "name_abbreviation": "Foo1 v. Bar1",
+ "decision_date": "1901-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "257 U.S. 222"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Upon.",
+ "id": 2,
+ "name": "Opportunity cup speech."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 47,
+ "ocr_confidence": 1.0,
+ "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc",
+ "simhash": "1:6e45862a08eb1d4c",
+ "word_count": 11,
+ "random_id": 1950914210,
+ "random_bucket": 38562
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [],
+ "parties": [
+ "Case text 0"
+ ],
+ "opinions": [
+ {
+ "text": "Case text 1Case text 2\nCase text 3",
+ "type": "majority",
+ "author": null
+ }
+ ],
+ "attorneys": [],
+ "corrections": "",
+ "head_matter": "Case text 0"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+}
diff --git a/capstone/test_data/cap_static/unredacted/us/1/cases/0002-02.json b/capstone/test_data/cap_static/unredacted/us/1/cases/0002-02.json
new file mode 100644
index 000000000..23937392e
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/1/cases/0002-02.json
@@ -0,0 +1,60 @@
+{
+ "id": 3,
+ "name": "First Foo2 versus First Bar2",
+ "name_abbreviation": "Foo2 v. Bar2",
+ "decision_date": "1902-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "311 U.S. 951"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Rule six your.",
+ "id": 3,
+ "name": "Little perhaps look many."
+ },
+ "jurisdiction": {
+ "id": 2,
+ "name_long": "Massachusetts",
+ "name": "Mass."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 39,
+ "ocr_confidence": 0,
+ "sha256": "e0819f285636dcbd644be2d72c1ef1e0e616ca51d3445280adbf00eab401e7c5",
+ "simhash": "1:03208952f875022c",
+ "word_count": 8,
+ "random_id": 2863198571,
+ "random_bucket": 61803
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [],
+ "parties": [
+ "Text 1"
+ ],
+ "opinions": [
+ {
+ "text": "Text 2Text 3not redacted\nText 4",
+ "type": "majority",
+ "author": null
+ }
+ ],
+ "attorneys": [],
+ "corrections": "",
+ "head_matter": "Text 1"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+}
diff --git a/capstone/test_data/cap_static/unredacted/us/1/html/0001-01.html b/capstone/test_data/cap_static/unredacted/us/1/html/0001-01.html
new file mode 100644
index 000000000..46b815f9c
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/1/html/0001-01.html
@@ -0,0 +1,12 @@
+
+
+
+ Case text 1Case text 2
+
+
+
diff --git a/capstone/test_data/cap_static/unredacted/us/1/html/0002-01.html b/capstone/test_data/cap_static/unredacted/us/1/html/0002-01.html
new file mode 100644
index 000000000..bbefaa490
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/1/html/0002-01.html
@@ -0,0 +1,12 @@
+
+
+
+ Case text 1Case text 2
+
+
+
diff --git a/capstone/test_data/cap_static/unredacted/us/1/html/0002-02.html b/capstone/test_data/cap_static/unredacted/us/1/html/0002-02.html
new file mode 100644
index 000000000..9d537bcb2
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/1/html/0002-02.html
@@ -0,0 +1,15 @@
+
+
+
+ Text 2Text 3not redacted
+
+
+
+
+
+
diff --git a/capstone/test_data/cap_static/unredacted/us/2/CasesMetadata.json b/capstone/test_data/cap_static/unredacted/us/2/CasesMetadata.json
new file mode 100644
index 000000000..29ed827d1
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/2/CasesMetadata.json
@@ -0,0 +1,134 @@
+[
+ {
+ "id": 4,
+ "name": "First Foo3 versus First Bar3",
+ "name_abbreviation": "Foo3 v. Bar3",
+ "decision_date": "1903-01-01",
+ "docket_number": "",
+ "first_page": "1",
+ "last_page": "5",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "447 U.S. 189"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Quickly walk.",
+ "id": 4,
+ "name": "Focus detail several position."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 47,
+ "ocr_confidence": 1.0,
+ "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc",
+ "simhash": "1:6e45862a08eb1d4c",
+ "word_count": 11,
+ "random_id": 3673096554,
+ "random_bucket": 362
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 3,
+ "last_page_order": 7
+ },
+ {
+ "id": 5,
+ "name": "First Foo4 versus First Bar4",
+ "name_abbreviation": "Foo4 v. Bar4",
+ "decision_date": "1904-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "872 U.S. 266"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Like area.",
+ "id": 5,
+ "name": "Commercial edge agency ground risk."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 47,
+ "ocr_confidence": 1.0,
+ "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc",
+ "simhash": "1:6e45862a08eb1d4c",
+ "word_count": 11,
+ "random_id": 2595981630,
+ "random_bucket": 35134
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+ },
+ {
+ "id": 6,
+ "name": "First Foo5 versus First Bar5",
+ "name_abbreviation": "Foo5 v. Bar5",
+ "decision_date": "1905-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "958 U.S. 5"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Same religious.",
+ "id": 6,
+ "name": "Size fish back degree."
+ },
+ "jurisdiction": {
+ "id": 2,
+ "name_long": "Massachusetts",
+ "name": "Mass."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 39,
+ "ocr_confidence": 0,
+ "sha256": "e0819f285636dcbd644be2d72c1ef1e0e616ca51d3445280adbf00eab401e7c5",
+ "simhash": "1:03208952f875022c",
+ "word_count": 8,
+ "random_id": 3120486931,
+ "random_bucket": 55827
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+ }
+]
diff --git a/capstone/test_data/cap_static/unredacted/us/2/VolumeMetadata.json b/capstone/test_data/cap_static/unredacted/us/2/VolumeMetadata.json
new file mode 100644
index 000000000..34bd2f2f0
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/2/VolumeMetadata.json
@@ -0,0 +1,28 @@
+{
+ "volume_number": "2",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "5012832128833",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+}
diff --git a/capstone/test_data/cap_static/unredacted/us/2/cases/0001-01.json b/capstone/test_data/cap_static/unredacted/us/2/cases/0001-01.json
new file mode 100644
index 000000000..847039168
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/2/cases/0001-01.json
@@ -0,0 +1,60 @@
+{
+ "id": 4,
+ "name": "First Foo3 versus First Bar3",
+ "name_abbreviation": "Foo3 v. Bar3",
+ "decision_date": "1903-01-01",
+ "docket_number": "",
+ "first_page": "1",
+ "last_page": "5",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "447 U.S. 189"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Quickly walk.",
+ "id": 4,
+ "name": "Focus detail several position."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 47,
+ "ocr_confidence": 1.0,
+ "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc",
+ "simhash": "1:6e45862a08eb1d4c",
+ "word_count": 11,
+ "random_id": 3673096554,
+ "random_bucket": 362
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [],
+ "parties": [
+ "Case text 0"
+ ],
+ "opinions": [
+ {
+ "text": "Case text 1Case text 2\nCase text 3",
+ "type": "majority",
+ "author": null
+ }
+ ],
+ "attorneys": [],
+ "corrections": "",
+ "head_matter": "Case text 0"
+ },
+ "first_page_order": 3,
+ "last_page_order": 7
+}
diff --git a/capstone/test_data/cap_static/unredacted/us/2/cases/0002-01.json b/capstone/test_data/cap_static/unredacted/us/2/cases/0002-01.json
new file mode 100644
index 000000000..5ec8f1fc9
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/2/cases/0002-01.json
@@ -0,0 +1,60 @@
+{
+ "id": 5,
+ "name": "First Foo4 versus First Bar4",
+ "name_abbreviation": "Foo4 v. Bar4",
+ "decision_date": "1904-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "872 U.S. 266"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Like area.",
+ "id": 5,
+ "name": "Commercial edge agency ground risk."
+ },
+ "jurisdiction": {
+ "id": 1,
+ "name_long": "United States",
+ "name": "U.S."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 47,
+ "ocr_confidence": 1.0,
+ "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc",
+ "simhash": "1:6e45862a08eb1d4c",
+ "word_count": 11,
+ "random_id": 2595981630,
+ "random_bucket": 35134
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [],
+ "parties": [
+ "Case text 0"
+ ],
+ "opinions": [
+ {
+ "text": "Case text 1Case text 2\nCase text 3",
+ "type": "majority",
+ "author": null
+ }
+ ],
+ "attorneys": [],
+ "corrections": "",
+ "head_matter": "Case text 0"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+}
diff --git a/capstone/test_data/cap_static/unredacted/us/2/cases/0002-02.json b/capstone/test_data/cap_static/unredacted/us/2/cases/0002-02.json
new file mode 100644
index 000000000..68917d4e2
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/2/cases/0002-02.json
@@ -0,0 +1,60 @@
+{
+ "id": 6,
+ "name": "First Foo5 versus First Bar5",
+ "name_abbreviation": "Foo5 v. Bar5",
+ "decision_date": "1905-01-01",
+ "docket_number": "",
+ "first_page": "2",
+ "last_page": "6",
+ "citations": [
+ {
+ "type": "official",
+ "cite": "958 U.S. 5"
+ }
+ ],
+ "court": {
+ "name_abbreviation": "Same religious.",
+ "id": 6,
+ "name": "Size fish back degree."
+ },
+ "jurisdiction": {
+ "id": 2,
+ "name_long": "Massachusetts",
+ "name": "Mass."
+ },
+ "cites_to": [],
+ "analysis": {
+ "cardinality": 6,
+ "char_count": 39,
+ "ocr_confidence": 0,
+ "sha256": "e0819f285636dcbd644be2d72c1ef1e0e616ca51d3445280adbf00eab401e7c5",
+ "simhash": "1:03208952f875022c",
+ "word_count": 8,
+ "random_id": 3120486931,
+ "random_bucket": 55827
+ },
+ "last_updated": "2023-12-04T18:17:29.088002+00:00",
+ "provenance": {
+ "date_added": "2023-12-04",
+ "source": "Harvard",
+ "batch": "2018"
+ },
+ "casebody": {
+ "judges": [],
+ "parties": [
+ "Text 1"
+ ],
+ "opinions": [
+ {
+ "text": "Text 2Text 3not redacted\nText 4",
+ "type": "majority",
+ "author": null
+ }
+ ],
+ "attorneys": [],
+ "corrections": "",
+ "head_matter": "Text 1"
+ },
+ "first_page_order": 4,
+ "last_page_order": 8
+}
diff --git a/capstone/test_data/cap_static/unredacted/us/2/html/0001-01.html b/capstone/test_data/cap_static/unredacted/us/2/html/0001-01.html
new file mode 100644
index 000000000..0449b9be6
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/2/html/0001-01.html
@@ -0,0 +1,12 @@
+
+
+
+ Case text 1Case text 2
+
+
+
diff --git a/capstone/test_data/cap_static/unredacted/us/2/html/0002-01.html b/capstone/test_data/cap_static/unredacted/us/2/html/0002-01.html
new file mode 100644
index 000000000..edcdd9fac
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/2/html/0002-01.html
@@ -0,0 +1,12 @@
+
+
+
+ Case text 1Case text 2
+
+
+
diff --git a/capstone/test_data/cap_static/unredacted/us/2/html/0002-02.html b/capstone/test_data/cap_static/unredacted/us/2/html/0002-02.html
new file mode 100644
index 000000000..accac1425
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/2/html/0002-02.html
@@ -0,0 +1,15 @@
+
+
+
+ Text 2Text 3not redacted
+
+
+
+
+
+
diff --git a/capstone/test_data/cap_static/unredacted/us/ReporterMetadata.json b/capstone/test_data/cap_static/unredacted/us/ReporterMetadata.json
new file mode 100644
index 000000000..85516ff50
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/ReporterMetadata.json
@@ -0,0 +1,21 @@
+{
+ "id": 1,
+ "full_name": "United States Reports",
+ "short_name": "U.S.",
+ "start_year": 1900,
+ "end_year": 2000,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "harvard_hollis_id": [],
+ "slug": "us"
+}
diff --git a/capstone/test_data/cap_static/unredacted/us/VolumesMetadata.json b/capstone/test_data/cap_static/unredacted/us/VolumesMetadata.json
new file mode 100644
index 000000000..1a24582d4
--- /dev/null
+++ b/capstone/test_data/cap_static/unredacted/us/VolumesMetadata.json
@@ -0,0 +1,58 @@
+[
+ {
+ "volume_number": "1",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "4909170303750",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+ },
+ {
+ "volume_number": "2",
+ "title": null,
+ "publisher": null,
+ "publication_year": null,
+ "start_year": null,
+ "end_year": null,
+ "series_volume_number": null,
+ "jurisdictions": [
+ {
+ "id": 2,
+ "name": "Mass.",
+ "name_long": "Massachusetts"
+ },
+ {
+ "id": 1,
+ "name": "U.S.",
+ "name_long": "United States"
+ }
+ ],
+ "id": "5012832128833",
+ "harvard_hollis_id": null,
+ "spine_start_year": null,
+ "spine_end_year": null,
+ "publication_city": null,
+ "second_part_of_id": null,
+ "nominative_reporter": null
+ }
+]
diff --git a/capstone/test_data/test_fixtures/fixtures.py b/capstone/test_data/test_fixtures/fixtures.py
index 9ee4a6954..7b7f37805 100644
--- a/capstone/test_data/test_fixtures/fixtures.py
+++ b/capstone/test_data/test_fixtures/fixtures.py
@@ -33,7 +33,7 @@
### Pytest setup ###
def pytest_addoption(parser):
- parser.addoption("--recreate_fastcase_files", action="store_true", default=False, help="Recreate files in test_data/fastcase rather than testing existing files")
+ parser.addoption("--recreate_files", action="store_true", default=False, help="Recreate files in test_data/ rather than testing existing files")
### Database setup ###
@@ -410,3 +410,101 @@ def urls(live_server):
@pytest.fixture
def map_data():
management.call_command('loaddata', ('jurisdiction', 'reporter', 'snippet'), database='capdb')
+
+
+@pytest.fixture
+def redacted_case_factory(case_factory):
+    """
+    Provide a factory for cases that live in a volume marked as redacted.
+
+    Every call creates a case through case_factory, overwrites the opinions of
+    its structure and the blocks of its first page with fixed sample data that
+    mixes redacted and unredacted content, and returns the reloaded case.
+    Extra keyword arguments are forwarded to case_factory.
+    """
+    def _sample_opinions():
+        # Built from literals on every call so separate cases never share lists/dicts.
+        head_opinion = {
+            "type": "head",
+            "paragraphs": [
+                # paragraph flagged as redacted
+                {
+                    "class": "parties",
+                    "block_ids": ["BL_1.1"],
+                    "id": "b1-1",
+                    "redacted": True,
+                }
+            ],
+        }
+        majority_opinion = {
+            "type": "majority",
+            "paragraphs": [
+                # paragraph whose blocks carry the redaction (BL_1.2 is flagged,
+                # BL_1.3 holds inline redact markers)
+                {
+                    "class": "p",
+                    "block_ids": ["BL_1.2", "BL_1.3"],
+                    "id": "b1-2",
+                },
+                # paragraph pointing at the redacted image block
+                {
+                    "class": "image",
+                    "block_ids": ["BL_1.4"],
+                    "id": "b1-3",
+                },
+            ],
+            "footnotes": [
+                # footnote flagged as redacted as a whole
+                {
+                    "paragraphs": [
+                        {
+                            "class": "p",
+                            "block_ids": ["BL_1.5"],
+                            "id": "b1-4",
+                        }
+                    ],
+                    "label": "1",
+                    "id": "footnote_1_1",
+                    "redacted": True,
+                }
+            ],
+        }
+        return [head_opinion, majority_opinion]
+
+    def _sample_blocks():
+        # Page blocks referenced by the block_ids used in _sample_opinions().
+        return [
+            {
+                "id": "BL_1.1",
+                "class": "p",
+                "tokens": ["Text 1"],
+                "rect": [25, 11, 300, 490],
+            },
+            {
+                "id": "BL_1.2",
+                "class": "p",
+                "tokens": ["Text 2"],
+                "redacted": True,
+                "rect": [4, 32, 100, 100],
+            },
+            {
+                "id": "BL_1.3",
+                "class": "p",
+                "tokens": [["redact"], "Text 3", ["/redact"], "not redacted"],
+                "rect": [225, 11, 430, 290],
+            },
+            {
+                "id": "BL_1.4",
+                "format": "image",
+                "redacted": True,
+                "class": "image",
+                "data": "image data",
+                "rect": [0, 0, 100, 100],
+            },
+            {
+                "id": "BL_1.5",
+                "class": "p",
+                "tokens": ["Text 4"],
+                "rect": [190, 312, 330, 490],
+            },
+        ]
+
+    def factory(**kwargs):
+        redacted_case = case_factory(volume__redacted=True, volume__pdf_file="redacted_volume.pdf", **kwargs)
+        case_structure = redacted_case.structure
+        first_page = case_structure.pages.first()
+
+        # replace the generated opinions with the sample ones
+        case_structure.opinions = _sample_opinions()
+        case_structure.save()
+
+        # replace the first page's blocks, then encrypt and store the page
+        # NOTE(review): encrypt() presumably protects the redacted content -- confirm against the page model
+        first_page.blocks = _sample_blocks()
+        first_page.encrypt()
+        first_page.save()
+
+        # regenerate the cached case body and reload the case from the database
+        redacted_case.sync_case_body_cache()
+        redacted_case.refresh_from_db()
+        return redacted_case
+
+    return factory
diff --git a/capstone/test_data/test_fixtures/helpers.py b/capstone/test_data/test_fixtures/helpers.py
index d9d1327ff..0ef0448e9 100644
--- a/capstone/test_data/test_fixtures/helpers.py
+++ b/capstone/test_data/test_fixtures/helpers.py
@@ -1,5 +1,6 @@
import difflib
import hashlib
+import os
from pathlib import Path
from scripts.helpers import parse_xml, parse_html
@@ -114,3 +115,24 @@ def sort_nested_dict(d):
if isinstance(d, (list, tuple)):
return [sort_nested_dict(v) for v in d]
return d
+
+
+def current_test_name():
+    """
+    Return the bare name of the currently running pytest test.
+
+    The name is taken from the PYTEST_CURRENT_TEST environment variable: the text
+    after the last ':' and up to the first following space is returned
+    (e.g. "tests/test_x.py::test_thing (call)" gives "test_thing").
+    Returns '' when the variable is not set, e.g. when called outside a pytest run.
+    """
+    # Default to '' so a missing variable yields '' instead of AttributeError on None.
+    current = os.environ.get('PYTEST_CURRENT_TEST', '')
+    return current.split(':')[-1].split(' ')[0]
+
+
+def check_path(pytestconfig, new_contents, saved_path):
+    """
+    Either report a diff between new_contents and saved_path, or update saved_path to match new_contents, depending on pytest --recreate_files.
+
+    Args:
+        pytestconfig: object whose getoption('recreate_files') tells whether saved files should be rewritten.
+        new_contents: the expected text, or a Path whose text is read and used as the expected text.
+        saved_path: Path of the stored reference file.
+
+    Raises:
+        AssertionError: if the contents differ, or saved_path is missing, and --recreate_files is not set.
+    """
+    if isinstance(new_contents, Path):
+        new_contents = new_contents.read_text()
+
+    # Track existence separately from contents, so that an existing but empty
+    # file is reported as changed rather than as missing.
+    saved_exists = saved_path.exists()
+    old_contents = saved_path.read_text() if saved_exists else ''
+
+    # NOTE: a missing file compares equal to empty new_contents; in that case
+    # nothing is written and nothing is reported.
+    if new_contents == old_contents:
+        return
+
+    if pytestconfig.getoption('recreate_files'):
+        saved_path.parent.mkdir(parents=True, exist_ok=True)
+        saved_path.write_text(new_contents)
+        return
+
+    if saved_exists:
+        # kept as a comparison assert so pytest can show the differing values
+        assert new_contents == old_contents, f"File {saved_path} has changed. Run pytest -k {current_test_name()} --recreate_files to update."
+    else:
+        # explicit raise instead of `assert False`, which would be stripped under python -O
+        raise AssertionError(f"File {saved_path} does not exist. Run pytest -k {current_test_name()} --recreate_files to update.")