diff --git a/capstone/capapi/resources.py b/capstone/capapi/resources.py index 75685a965..570fe650b 100644 --- a/capstone/capapi/resources.py +++ b/capstone/capapi/resources.py @@ -1,6 +1,6 @@ import hashlib import concurrent.futures -from copy import copy +from copy import copy from functools import reduce import rest_framework.request @@ -13,6 +13,7 @@ from django.db.models import QuerySet from django.http import QueryDict from django.test.utils import CaptureQueriesContext +from django.test.client import RequestFactory from django.utils.functional import SimpleLazyObject from django_hosts import reverse as django_hosts_reverse from elasticsearch import Elasticsearch @@ -269,3 +270,11 @@ def api_request(request, viewset, method, url_kwargs={}, get_params={}): api_request.GET.update(get_params) return viewset.as_view({'get': method})(api_request, **url_kwargs) + + +def call_serializer(Serializer, item, query_params=None): + """ + Make a fake DRF request so we can call a DRF serializer with the expected context. + """ + request = rest_framework.request.Request(RequestFactory().get('/', query_params)) + return Serializer(item, context={'request': request}).data diff --git a/capstone/capapi/serializers.py b/capstone/capapi/serializers.py index 3115802d7..0870154ce 100644 --- a/capstone/capapi/serializers.py +++ b/capstone/capapi/serializers.py @@ -546,49 +546,9 @@ def data(self): return super(DocumentSerializer, self).data -class ConvertCaseDocumentSerializer(CaseDocumentSerializer): - first_page_order = serializers.CharField() - last_page_order = serializers.CharField() - - def to_representation(self, instance): - first_page_order = self.context.get("first_page_order") - last_page_order = self.context.get("last_page_order") - - data = super().to_representation(instance) - - data.pop("reporter", None) - data.pop("volume", None) - data.pop("url", None) - data.pop("frontend_url", None) - data.pop("frontend_pdf_url", None) - try: - data["court"].pop("slug", None) - data["court"].pop("url", None) - except KeyError as err: - print(f"Cannot pop field {err} because 'court' doesn't exist") - try: - data["jurisdiction"].pop("slug", None) - data["jurisdiction"].pop("whitelisted", None) - data["jurisdiction"].pop("url", None) - except KeyError as err: - print(f"Cannot pop field {err} because 'jurisdiction' doesn't exist") - - if "preview" in data: - data.pop("preview") - data["first_page_order"] = first_page_order - data["last_page_order"] = last_page_order - return data - - class ConvertNoLoginCaseDocumentSerializer(CaseDocumentSerializerWithCasebody): - first_page_order = serializers.CharField() - last_page_order = serializers.CharField() - def to_representation(self, instance, check_permissions=False): """Tell get_casebody not to check for case download permissions.""" - first_page_order = self.context.get("first_page_order") - last_page_order = self.context.get("last_page_order") - data = super().to_representation(instance, check_permissions=check_permissions) try: data["casebody"] = data["casebody"]["data"] @@ -615,8 +575,6 @@ def to_representation(self, instance, check_permissions=False): data.pop("preview", None) - data["first_page_order"] = first_page_order - data["last_page_order"] = last_page_order return data @property diff --git a/capstone/capdb/tests/test_models.py b/capstone/capdb/tests/test_models.py index d901c8795..998782fad 100644 --- a/capstone/capdb/tests/test_models.py +++ b/capstone/capdb/tests/test_models.py @@ -93,96 +93,8 @@ def test_volume_save_slug_update(volume_metadata): 
@pytest.mark.django_db(databases=["capdb"]) -def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path): - # set up a redacted case - case = case_factory(volume__redacted=True, volume__pdf_file="redacted_volume.pdf") - structure = case.structure - page = structure.pages.first() - structure.opinions = [ - # redacted paragraph - { - "type": "head", - "paragraphs": [ - { - "class": "parties", - "block_ids": ["BL_1.1"], - "id": "b1-1", - "redacted": True, - } - ], - }, - { - "type": "majority", - "paragraphs": [ - # redacted content blocks - { - "class": "p", - "block_ids": ["BL_1.2", "BL_1.3"], - "id": "b1-2", - }, - # redacted image block - { - "class": "image", - "block_ids": ["BL_1.4"], - "id": "b1-3", - }, - ], - # redacted footnote - "footnotes": [ - { - # redacted footnote paragraph - "paragraphs": [ - { - "class": "p", - "block_ids": ["BL_1.5"], - "id": "b1-4", - } - ], - "label": "1", - "id": "footnote_1_1", - "redacted": True, - } - ], - }, - ] - structure.save() - page.blocks = [ - { - "id": "BL_1.1", - "class": "p", - "tokens": ["Text 1"], - "rect": [25, 11, 300, 490], - }, - { - "id": "BL_1.2", - "class": "p", - "tokens": ["Text 2"], - "redacted": True, - "rect": [4, 32, 100, 100], - }, - { - "id": "BL_1.3", - "class": "p", - "tokens": [["redact"], "Text 3", ["/redact"]], - "rect": [225, 11, 430, 290], - }, - { - "id": "BL_1.4", - "format": "image", - "redacted": True, - "class": "image", - "data": "image data", - "rect": [0, 0, 100, 100], - }, - { - "id": "BL_1.5", - "class": "p", - "tokens": ["Text 4"], - "rect": [190, 312, 330, 490], - }, - ] - page.encrypt() - page.save() +def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path, redacted_case_factory): + case = redacted_case_factory() # set up volume pdfs volume = case.volume @@ -194,15 +106,16 @@ def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path): download_files_storage.save(volume.pdf_file.name, StringIO("redacted")) # verify redacted case contents - case.sync_case_body_cache() - case.refresh_from_db() - assert case.body_cache.text == "\n\n" + assert case.body_cache.text == "\nnot redacted\n" assert xml_equal( case.body_cache.html, - '
\n' - '
\n' - '
\n' - "
", + '
\n' + '
\n' + '
\n' + '

not redacted

\n' + '
\n' + '
\n', ) # unredact @@ -210,7 +123,7 @@ def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path): volume.refresh_from_db() case.body_cache.refresh_from_db() assert volume.redacted is False - assert case.body_cache.text == "Text 1\nText 2Text 3\nText 4\n" + assert case.body_cache.text == "Text 1\nText 2Text 3not redacted\nText 4\n" assert html_equal( case.body_cache.html, dedent( @@ -220,7 +133,7 @@ def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path):

Text 1

-

Text 2Text 3

+

Text 2Text 3not redacted

diff --git a/capstone/config/celery.py b/capstone/config/celery.py index f4df3a723..b27378866 100644 --- a/capstone/config/celery.py +++ b/capstone/config/celery.py @@ -17,7 +17,7 @@ 'scripts.update_snippets', 'scripts.refactor_xml', 'scripts.make_pdf', - 'scripts.convert_s3', + 'scripts.export_cap_static', ]) # Using a string here means the worker doesn't have to serialize diff --git a/capstone/config/settings/settings_base.py b/capstone/config/settings/settings_base.py index 619f5d2c8..1f3b40552 100644 --- a/capstone/config/settings/settings_base.py +++ b/capstone/config/settings/settings_base.py @@ -385,6 +385,10 @@ def immutable_file_test(path, url): }, } CELERY_TIMEZONE = 'UTC' +CELERY_TASK_ROUTES = { + "scripts.export_cap_static.export_cases_by_volume": {"queue": "cap_static"}, +} + ### CAP API settings ### diff --git a/capstone/fabfile.py b/capstone/fabfile.py index 2bc394af5..9d5444db1 100644 --- a/capstone/fabfile.py +++ b/capstone/fabfile.py @@ -7,6 +7,7 @@ import subprocess import sys import tempfile +import traceback from collections import defaultdict from contextlib import contextmanager from datetime import datetime @@ -61,8 +62,8 @@ validate_private_volumes as validate_private_volumes_script, export, update_snippets, + export_cap_static, ) -from scripts import convert_s3 from scripts.helpers import ( copy_file, volume_barcode_from_folder, @@ -298,6 +299,7 @@ def import_web_volumes(): try: import_volume(f.name) except IntegrityError: + traceback.print_exc() print(" - integrity error; volume already imported? skipping") @@ -452,31 +454,30 @@ def retry_export_cases(version_string): @task -def export_cases_to_s3(reporter="528"): +def export_cap_static_cases(dest_dir="/tmp/cap_exports", reporter=None, volume=None, last_run_before=None): """ - Export a version to S3 of all cases' texts and metadata - by reporter and volume. + First step of the static files export process: export cases, one celery task per volume. """ - redacted = True - bucket = convert_s3.get_bucket_name(redacted) - convert_s3.export_cases_to_s3(bucket, redacted, reporter) - - -@task -def export_reporters_to_s3(): - """ - Run export of all reporters and their contents to S3. - """ - convert_s3.put_reporters_on_s3(redacted=True) + print("Scheduling tasks to reindex volumes") + volumes = VolumeMetadata.objects.exclude(out_of_scope=True) + if volume: + volumes = volumes.filter(pk=volume) + if reporter: + volumes = volumes.filter(reporter_id=reporter) + tasks.run_task_for_volumes( + export_cap_static.export_cases_by_volume, + volumes, + last_run_before=last_run_before, + dest_dir=dest_dir, + ) @task -def export_reporters_to_s3_trial(): +def summarize_cap_static(dest_dir="/tmp/cap_exports"): """ - Run export of all reporters and their contents to S3 - for first API page. + Second step of the static files export process: add summary files at the reporter level and top level. 
""" - convert_s3.put_reporters_on_s3_trial(redacted=True) + export_cap_static.finalize_reporters(dest_dir) @task diff --git a/capstone/scripts/convert_s3.py b/capstone/scripts/convert_s3.py deleted file mode 100644 index 1179595af..000000000 --- a/capstone/scripts/convert_s3.py +++ /dev/null @@ -1,522 +0,0 @@ -import base64 -import boto3 -import hashlib -import json -import requests -from botocore.exceptions import ClientError -from collections import namedtuple -from celery import group, shared_task - -from capapi.documents import CaseDocument -from capapi.serializers import ( - ConvertNoLoginCaseDocumentSerializer, -) -from capdb.models import Reporter, VolumeMetadata - -s3_client = boto3.client("s3") -api_endpoint = "https://api.case.law/v1/" - - -def put_reporters_on_s3_trial(redacted: bool) -> None: - """ - Kicks off the full cascading S3 file creation series - for a subsection of reporters. - """ - # set bucket name for all operations - bucket = get_bucket_name(redacted) - - current_endpoint = f"{api_endpoint}reporters/" - print("Converting files from ", current_endpoint) - response = requests.get(current_endpoint) - results = response.json() - reporters_metadata = "" - all_volumes_metadata = "" - - # write each entry into jsonl - for result in results["results"]: - # for each reporter, kick off cascading export to S3 - reporter_metadata, subset_volumes_metadata = export_cases_to_s3( - bucket, redacted, result["id"] - ) - reporters_metadata += reporter_metadata - all_volumes_metadata += subset_volumes_metadata - - # uploads all reporters metadata to top level - hash_and_upload( - reporters_metadata, - bucket, - "ReportersMetadata.jsonl", - "application/jsonl", - ) - - # uploads all volumes metadata to top level - hash_and_upload( - all_volumes_metadata, - bucket, - "VolumesMetadata.jsonl", - "application/jsonl", - ) - - -def put_reporters_on_s3(redacted: bool) -> None: - """ - Kicks off the full cascading file creation series. - """ - # set bucket name for all operations - bucket = get_bucket_name(redacted) - - current_endpoint = f"{api_endpoint}reporters/" - previous_cursor = None - current_cursor = "" - reporters_metadata = "" - all_volumes_metadata = "" - - while current_endpoint: - print("Converting files from ", current_endpoint) - response = requests.get(current_endpoint) - results = response.json() - - # write each entry into jsonl - for result in results["results"]: - # for each reporter, kick off cascading export to S3 - reporter_metadata, subset_volumes_metadata = export_cases_to_s3( - bucket, redacted, result["id"] - ) - reporters_metadata += reporter_metadata - all_volumes_metadata += subset_volumes_metadata - - # update cursor to access next endpoint - current_cursor = results["next"] - if current_cursor != previous_cursor: - print("Update next to: ", current_cursor) - - previous_cursor = current_cursor - current_endpoint = current_cursor - - # uploads all reporters metadata to top level - hash_and_upload( - reporters_metadata, - bucket, - "ReportersMetadata.jsonl", - "application/jsonl", - ) - - # uploads all volumes metadata to top level - hash_and_upload( - all_volumes_metadata, - bucket, - "VolumesMetadata.jsonl", - "application/jsonl", - ) - - -def export_cases_to_s3(bucket: str, redacted: bool, reporter_id: str) -> tuple: - """ - Write .jsonl file with all cases per reporter. 
- """ - reporter = Reporter.objects.get(pk=reporter_id) - - # Make sure there are volumes in the reporter - if not reporter.volumes.exclude(out_of_scope=True): - print("WARNING: Reporter '{}' contains NO VOLUMES.".format(reporter.full_name)) - # Returning empty string to have something to append to reporter metadata - return ("", "") - - # Make sure there are cases in the reporter - cases_search = CaseDocument.raw_search().filter("term", reporter__id=reporter.id) - if cases_search.count() == 0: - print("WARNING: Reporter '{}' contains NO CASES.".format(reporter.full_name)) - # Returning empty string to have something to append to reporter metadata - return ("", "") - - # TODO: address reporters that share slug - if reporter_id in reporter_slug_dict: - reporter_prefix = reporter_slug_dict[reporter_id] - else: - reporter_prefix = reporter.short_name_slug - - # upload reporter metadata - reporter_metadata = put_reporter_metadata(bucket, reporter, reporter_prefix) - - # get in-scope volumes with volume numbers in each reporter - subset_volumes_metadata = "" - - job = group( - export_cases_by_volume.s( - volume=volume.barcode, - reporter_prefix=reporter_prefix, - dest_bucket=bucket, - redacted=redacted, - ) - for volume in ( - reporter.volumes.exclude(volume_number=None) - .exclude(volume_number="") - .exclude(out_of_scope=True) - ) - ) - - results = job.apply_async() - - for i in range(3): - try: - for volume_metadata in results.get(): - subset_volumes_metadata += volume_metadata - break - except ClientError as err: - if err.response['Error']['Code'] == 'NoSuchKey': - print(f'NoSuchKey in {reporter_id} on try {i + 1}') - else: - raise - - return (reporter_metadata, subset_volumes_metadata) - - -@shared_task -def export_cases_by_volume( - volume: str, reporter_prefix: str, dest_bucket: str, redacted: bool -) -> str: - """ - Write a .json file for each case per volume. - Write a .jsonl file with all cases' metadata per volume. - Write a .jsonl file with all volume metadata for this collection. 
- """ - - volume = VolumeMetadata.objects.get(pk=volume) - - case_file_name_index = 1 - prev_case_first_page = None - - vars = { - "serializer": ConvertNoLoginCaseDocumentSerializer, - "query_params": {"body_format": "text"}, - } - - cases = list(volume.case_metadatas.select_related().order_by("case_id")) - - if len(cases) == 0: - print("WARNING: Volume '{}' contains NO CASES.".format(volume.barcode)) - # Returning empty string to have something to append to volume metadata - return "" - - # open each volume and put case text or metadata into file based on format - cases_search = CaseDocument.raw_search().filter( - "term", volume__barcode=volume.barcode - ) - - # create a dictionary to grab data from each CaseDocument search object - cases_search_by_id = { - case_search["_source"]["id"]: case_search for case_search in cases_search.scan() - } - - volume_prefix = f"{reporter_prefix}/{volume.volume_number}" - volume_metadata = put_volume_metadata(dest_bucket, volume, volume_prefix) - - cases_key = f"{volume_prefix}/Cases/" - - # fetch existing files to compare to what we have - s3_contents_hashes = fetch_s3_files(dest_bucket, cases_key) - - # fake Request object used for serializing case with DRF's serializer - vars["fake_request"] = namedtuple("Request", ["query_params", "accepted_renderer"])( - query_params=vars["query_params"], - accepted_renderer=None, - ) - # fake Request object used for serializing cases with DRF's serializer - vars["fake_request"] = namedtuple("Request", ["query_params", "accepted_renderer"])( - query_params={"body_format": "text"}, - accepted_renderer=None, - ) - - # create a metadata contents string to append case metadata content - metadata_contents = "" - - # store the serialized case data - for case in cases: - # identify associated search item to add additional data - try: - item = cases_search_by_id[case.id] - except KeyError: - continue - - serializer = vars["serializer"]( - item["_source"], - context={ - "request": vars["fake_request"], - "first_page_order": case.first_page_order, - "last_page_order": case.last_page_order, - }, - ) - - # add data to metadata_contents string without 'casebody' - metadata_data = serializer.data - metadata_data.pop("casebody", None) - metadata_contents += json.dumps(metadata_data) + "\n" - - # compose each casefile with a hash - case_contents = json.dumps(serializer.data) + "\n" - hash_object = hashlib.sha256(case_contents.encode("utf-8")) - case_contents_hash = base64.b64encode(hash_object.digest()).decode() - - # calculate casefile name - if prev_case_first_page == case.first_page: - case_file_name_index += 1 - else: - case_file_name_index = 1 - case_file_name = ( - f"{case.first_page.zfill(4)}-{str(case_file_name_index).zfill(2)}.json" - ) - - # set so we can use to determine multiple cases on single page - prev_case_first_page = case.first_page - - # identify key: hash pair for current case - dest_key = f"{cases_key}{case_file_name}" - s3_key_hash = s3_contents_hashes.pop(dest_key, None) - - if s3_key_hash is None or s3_key_hash != case_contents_hash: - hash_and_upload( - case_contents, - dest_bucket, - dest_key, - "application/jsonl", - ) - - # remove files from S3 that would otherwise create repeats - for s3_case_key in s3_contents_hashes: - try: - s3_client.delete_object( - Bucket=dest_bucket, - Key=s3_case_key, - ) - except ClientError as err: - if err.response['Error']['Code'] == 'NoSuchKey': - print(f"Couldn't delete {s3_case_key}, no such key") - else: - raise Exception( - f"Couldn't delete {dest_bucket}/{s3_case_key}: 
%s" % err - ) - - hash_and_upload( - metadata_contents, - dest_bucket, - f"{volume_prefix}/CasesMetadata.jsonl", - "application/jsonl", - ) - - # copies each volume PDF to new location if it doesn't already exist - copy_volume_pdf(volume, volume_prefix, dest_bucket, redacted) - # return metadata for single volume - return volume_metadata - - -# Reporter-specific helper functions - -# Some reporters share a slug, so we have to differentiate with ids -reporter_slug_dict = { - "415": "us-ct-cl", - "657": "wv-ct-cl", - "580": "mass-app-div-annual", - "576": "mass-app-div", -} - - -def put_reporter_metadata(bucket: str, reporter: object, key: str) -> str: - """ - Write a .json file with just the reporter metadata. - Return the line of reporter metadata to be used in all reporters metadata file. - """ - response = requests.get(f"{api_endpoint}reporters/{reporter.id}/") - results = response.json() - - # add additional fields from reporter obj - results["harvard_hollis_id"] = reporter.hollis - - # remove unnecessary fields - results.pop("url", None) - results.pop("frontend_url", None) - try: - for jurisdiction in results["jurisdictions"]: - jurisdiction.pop("slug", None) - jurisdiction.pop("whitelisted", None) - jurisdiction.pop("url", None) - except KeyError as err: - print(f"Cannot pop field {err} because 'jurisdictions' doesn't exist") - - reporter_metadata = json.dumps(results) + "\n" - # add each line to reporters_metadata string - hash_and_upload( - reporter_metadata, bucket, f"{key}/ReporterMetadata.json", "application/json" - ) - return reporter_metadata - - -# Volume-specific helper functions - - -def put_volume_metadata(bucket: str, volume: object, key: str) -> str: - """ - Write a .json file with just the single volume metadata. - """ - response = requests.get(f"{api_endpoint}volumes/{volume.barcode}/") - results = response.json() - # change "barcode" key to "id" key - results["id"] = results.pop("barcode", None) - - # add additional fields from model - results["harvard_hollis_id"] = volume.hollis_number - results["spine_start_year"] = volume.spine_start_year - results["spine_end_year"] = volume.spine_end_year - results["publication_city"] = volume.publication_city - results["second_part_of_id"] = volume.second_part_of_id - - # add information about volume's nominative_reporter - if volume.nominative_reporter_id: - results["nominative_reporter"] = {} - results["nominative_reporter"]["id"] = volume.nominative_reporter_id - results["nominative_reporter"][ - "short_name" - ] = volume.nominative_reporter.short_name - results["nominative_reporter"][ - "full_name" - ] = volume.nominative_reporter.full_name - results["nominative_reporter"][ - "volume_number" - ] = volume.nominative_volume_number - results.pop("nominative_volume_number", None) - results.pop("nominative_name", None) - elif volume.nominative_reporter_id is None and ( - volume.nominative_volume_number or volume.nominative_name - ): - results["nominative_reporter"] = {} - results["nominative_reporter"][ - "volume_number" - ] = volume.nominative_volume_number - results["nominative_reporter"]["nominative_name"] = volume.nominative_name - else: - results["nominative_reporter"] = None - - # remove unnecessary fields - results.pop("reporter", None) - results.pop("reporter_url", None) - results.pop("url", None) - results.pop("pdf_url", None) - results.pop("frontend_url", None) - try: - for jurisdiction in results["jurisdictions"]: - jurisdiction.pop("slug", None) - jurisdiction.pop("whitelisted", None) - jurisdiction.pop("url", None) - 
except KeyError as err: - print(f"Cannot pop field {err} because 'jurisdictions' doesn't exist") - - volume_metadata = json.dumps(results) + "\n" - hash_and_upload( - volume_metadata, bucket, f"{key}/VolumeMetadata.json", "application/json" - ) - return volume_metadata - - -def copy_volume_pdf( - volume: object, volume_prefix: str, dest_bucket: str, redacted: bool -) -> None: - """ - Copy PDF volume from original location to destination bucket - """ - if redacted: - source_prefix = "pdf/redacted" - else: - source_prefix = "pdf/unredacted" - - try: - s3_client.head_object(Bucket=dest_bucket, Key=f"{volume_prefix}/Volume.pdf") - print(f"{dest_bucket}/{volume_prefix}/Volume.pdf already uploaded!") - except ClientError as err: - if err.response["Error"]["Code"] == "404": - # "With a copy command, the checksum of the object is a direct checksum of the full object." - # https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html - copy_source = { - "Bucket": "harvard-cap-archive", - "Key": f"{source_prefix}/{volume.barcode}.pdf", - } - copy_object_params = { - "Bucket": dest_bucket, - "Key": f"{volume_prefix}/Volume.pdf", - "CopySource": copy_source, - } - - s3_client.copy_object(**copy_object_params) - print( - f"Copied {source_prefix}/{volume.barcode}.pdf to \ - {volume_prefix}/Volume.pdf" - ) - else: - raise Exception( - f"Cannot upload {source_prefix}/{volume.barcode}.pdf to \ - {volume_prefix}/Volume.pdf: %s" - % err - ) - - -# Case-specific helper functions - - -def fetch_s3_files(bucket: str, key: str) -> dict: - """ - Return a dictionary of bucket contents format key: hash - """ - try: - s3_contents_hash = {} - response = s3_client.list_objects_v2(Bucket=bucket, Prefix=key) - except ClientError as err: - raise Exception(f"Cannot list objects {bucket}/{key}: %s" % err) - if "Contents" not in response: - return s3_contents_hash - else: - for case in response["Contents"]: - # Get the object's metadata - try: - response = s3_client.get_object_attributes( - Bucket=bucket, Key=case["Key"], ObjectAttributes=["Checksum"] - ) - - existing_hash = response.get("Checksum", {}).get("ChecksumSHA256") - s3_contents_hash[case["Key"]] = existing_hash - except ClientError as err: - raise Exception(f"Cannot check file {bucket}/{case['Key']}: %s" % err) - - return s3_contents_hash - - -# General helper functions - - -def hash_and_upload(contents: str, bucket: str, key: str, content_type: str) -> None: - """ - Hash created file and upload to S3 - """ - # Calculate the SHA256 hash of the contents data - hash_object = hashlib.sha256(contents.encode("utf-8")) - sha256_hash = base64.b64encode(hash_object.digest()).decode() - # upload file to S3 - try: - s3_client.put_object( - Body=contents, - Bucket=bucket, - Key=key, - ContentType=content_type, - ChecksumSHA256=sha256_hash, - ) - print(f"Completed {key}") - except ClientError as err: - raise Exception(f"Error uploading {key}: %s" % err) - - -def get_bucket_name(redacted: bool) -> str: - """ - Create bucket name based on redaction status - """ - if redacted: - bucket = "cap-redacted" - else: - bucket = "cap-unredacted" - return bucket diff --git a/capstone/scripts/export.py b/capstone/scripts/export.py index c1134a3d0..a499ec337 100644 --- a/capstone/scripts/export.py +++ b/capstone/scripts/export.py @@ -3,7 +3,6 @@ import tempfile import zipfile from io import StringIO -from collections import namedtuple from datetime import date from pathlib import Path from celery import shared_task @@ -13,6 +12,7 @@ from django.utils import timezone 
from capapi.documents import CaseDocument +from capapi.resources import call_serializer from capapi.serializers import NoLoginCaseDocumentSerializer, CaseDocumentSerializer from capdb.models import Jurisdiction, Reporter from capdb.storages import download_files_storage @@ -155,12 +155,6 @@ def export_case_documents(cases, zip_path, filter_item, public=False): "Bagging-Date: %s\n" ) % (filter_item, timezone.now().strftime("%Y-%m-%d")) - # fake Request object used for serializing cases with DRF's serializer - vars['fake_request'] = namedtuple('Request', ['query_params', 'accepted_renderer'])( - query_params=vars['query_params'], - accepted_renderer=None, - ) - # create new zip file in memory vars['out_spool'] = tempfile.TemporaryFile() vars['archive'] = zipfile.ZipFile(vars['out_spool'], 'w', zipfile.ZIP_STORED) @@ -171,8 +165,8 @@ def export_case_documents(cases, zip_path, filter_item, public=False): # write each case for item in cases.scan(): for format_name, vars in formats.items(): - serializer = vars['serializer'](item['_source'], context={'request': vars['fake_request']}) - vars['compressed_data_file'].write(bytes(json.dumps(serializer.data), 'utf8') + b'\n') + data = call_serializer(vars['serializer'], item['_source'], vars['query_params']) + vars['compressed_data_file'].write(bytes(json.dumps(data), 'utf8') + b'\n') # finish bag for each format for format_name, vars in formats.items(): diff --git a/capstone/scripts/export_cap_static.py b/capstone/scripts/export_cap_static.py new file mode 100644 index 000000000..fbb436188 --- /dev/null +++ b/capstone/scripts/export_cap_static.py @@ -0,0 +1,334 @@ +import shutil +import tempfile +from pathlib import Path + +import boto3 +import json +from botocore.exceptions import ClientError +from celery import shared_task +from django.conf import settings +from django.db import transaction +from tqdm import tqdm + +from capapi.documents import CaseDocument +from capapi.resources import call_serializer +from capapi.serializers import VolumeSerializer, NoLoginCaseDocumentSerializer, ReporterSerializer +from capdb.models import Reporter, VolumeMetadata, Jurisdiction +from scripts.update_snippets import get_map_numbers + + +# steps: +# - export volumes: fab export_cap_static_cases calls export_cases_by_volume() +# - export reporter metadata: fab summarize_cap_static calls finalize_reporters() +# - (not in codebase yet) copy PDFs and captars from one part of S3 to another + + +def finalize_reporters(dest_dir: str) -> None: + """ + """ + dest_dir = Path(dest_dir) + for sub_dir in ("redacted", "unredacted"): + if (dest_dir / sub_dir).exists(): + finalize_reporters_dir(dest_dir / sub_dir) + +def finalize_reporters_dir(dest_dir: Path) -> None: + + # write missing reporter metadata + print("Writing missing reporter metadata") + all_volumes = [] + for reporter_dir in tqdm(dest_dir.iterdir()): + if not reporter_dir.is_dir(): + continue + reporter_metadata_path = reporter_dir / "ReporterMetadata.json" + if reporter_metadata_path.exists(): + continue + + # fetch reporter object + if reporter_dir.name in reporter_slug_dict_reverse: + reporter = Reporter.objects.get(pk=reporter_slug_dict_reverse[reporter_dir.name]) + else: + reporter = Reporter.objects.get(short_name_slug=reporter_dir.name) + + # export reporter metadata + reporter_dict = call_serializer(ReporterSerializer, reporter) + reporter_dict["harvard_hollis_id"] = reporter.hollis + reporter_dict["slug"] = reporter_dir.name + remove_keys(reporter_dict, ["url", "frontend_url", ("jurisdictions", ["slug", 
"whitelisted", "url"])]) + write_json(reporter_metadata_path, reporter_dict) + + # write reporter-level VolumesMetadata.json + print("Writing VolumesMetadata.json") + volumes_metadata = [json.loads(f.read_text()) for f in reporter_dir.glob("*/VolumeMetadata.json")] + write_json(reporter_dir / "VolumesMetadata.json", volumes_metadata) + all_volumes.extend(volumes_metadata) + + # write ReportersMetadata.json + print("Writing ReportersMetadata.json") + reporters_metadata = [json.loads(f.read_text()) for f in dest_dir.glob("*/ReporterMetadata.json")] + write_json(dest_dir / "ReportersMetadata.json", reporters_metadata) + + # write JurisdictionsMetadata.json + # this is the same data as ReportersMetadata.json, but with a list of reporters for each jurisdiction + # instead of a list of jurisdictions for each reporter + print("Writing JurisdictionsMetadata.json") + jurisdictions = {} + jurisdiction_counts = get_map_numbers() + for jurisdiction in Jurisdiction.objects.all(): + if jurisdiction.slug not in jurisdiction_counts: + continue + jurisdictions[jurisdiction.id] = { + "id": jurisdiction.pk, + "slug": jurisdiction.slug, + "name": jurisdiction.name, + "name_long": jurisdiction.name_long, + **jurisdiction_counts[jurisdiction.slug], + "reporters": [], + } + + for reporter in reporters_metadata: + reporter_jurisdictions = reporter.pop("jurisdictions") + for jurisdiction in reporter_jurisdictions: + jurisdictions[jurisdiction["id"]]["reporters"].append(reporter) + + jurisdictions = [j for j in sorted(jurisdictions.values(), key=lambda j: j["name_long"])] + write_json(dest_dir / "JurisdictionsMetadata.json", jurisdictions) + + # write top-level VolumesMetadata.json + print("Writing VolumesMetadata.json") + write_json(dest_dir / "VolumesMetadata.json", all_volumes) + + +@shared_task +def export_cases_by_volume(volume: str, dest_dir: str) -> None: + volume = VolumeMetadata.objects.select_related("reporter").get(pk=volume) + dest_dir = Path(dest_dir) + export_volume(volume, dest_dir / "redacted") + + # export unredacted version of redacted volumes + if settings.REDACTION_KEY and volume.redacted: + # use a transaction to temporarily unredact the volume, then roll back + with transaction.atomic('capdb'): + volume.unredact(replace_pdf=False) + export_volume(volume, dest_dir / "unredacted") + transaction.set_rollback(True, using='capdb') + +def export_volume(volume: VolumeMetadata, dest_dir: Path) -> None: + """ + Write a .json file for each case per volume. + Write an .html file for each case per volume. + Write a .json file with all case metadata per volume. + Write a .json file with all volume metadata for this collection. 
+ """ + + # set up vars + print("Exporting volume", volume.get_frontend_url()) + reporter_prefix = reporter_slug_dict.get(volume.reporter_id, volume.reporter.short_name_slug) + volume_dir = dest_dir / reporter_prefix / volume.volume_number + + # don't overwrite existing volumes + if volume_dir.exists(): + return + + # find cases to write + cases = list(volume.case_metadatas.filter(in_scope=True).for_indexing().order_by('case_id')) + if not cases: + print(f"WARNING: Volume '{volume.barcode}' contains NO CASES.") + return + + # set up temp volume dir + temp_dir = tempfile.TemporaryDirectory() + temp_volume_dir = Path(temp_dir.name) + cases_dir = temp_volume_dir / "cases" + cases_dir.mkdir() + html_dir = temp_volume_dir / "html" + html_dir.mkdir() + volume_metadata = volume_to_dict(volume) + write_json(temp_volume_dir / "VolumeMetadata.json", volume_metadata) + + # variables for case export loop + case_file_name_index = 1 + prev_case_first_page = None + case_metadatas = [] + case_doc = CaseDocument() + + # store the serialized case data + for case in cases: + # convert case model to search index format + search_item = case_doc.prepare(case) + search_item['last_updated'] = search_item['last_updated'].isoformat() + search_item['decision_date'] = search_item['decision_date'].isoformat() + + # convert search index format to API format + case_data = call_serializer(NoLoginCaseDocumentSerializer, search_item, {"body_format": "text"}) + + # update case_data to match our output format: + if "casebody" in case_data: + case_data["casebody"] = case_data["casebody"]["data"] + case_data["first_page_order"] = case.first_page_order + case_data["last_page_order"] = case.last_page_order + remove_keys(case_data, [ + "reporter", + "volume", + "url", + "frontend_url", + "frontend_pdf_url", + "preview", + ("court", ["slug", "url"]), + ("jurisdiction", ["slug", "whitelisted", "url"]), + ]) + for cite in case_data["cites_to"]: + cite["opinion_index"] = cite.pop("opinion_id") + + # calculate casefile name + first_page = case_data["first_page"] + if prev_case_first_page == first_page: + case_file_name_index += 1 + else: + case_file_name_index = 1 + prev_case_first_page = first_page + case_file_name = f"{first_page:0>4}-{case_file_name_index:0>2}.json" + + # write casefile + write_json(cases_dir / case_file_name, case_data) + + # write metadata without 'casebody' + case_data.pop("casebody", None) + case_metadatas.append(case_data) + + # write html file + html_file_path = (html_dir / case_file_name).with_suffix(".html") + html_file_path.write_text(search_item["casebody_data"]["html"]) + + # write metadata file + write_json(temp_volume_dir / "CasesMetadata.json", case_metadatas) + + # move to real directory + volume_dir.parent.mkdir(exist_ok=True, parents=True) + shutil.copytree(temp_volume_dir, volume_dir) + + +def volume_to_dict(volume: VolumeMetadata) -> dict: + """ + Write a .json file with just the single volume metadata. 
+ """ + volume_data = call_serializer(VolumeSerializer, volume) + + # change "barcode" key to "id" key + volume_data["id"] = volume_data.pop("barcode", None) + + # add additional fields from model + volume_data["harvard_hollis_id"] = volume.hollis_number + volume_data["spine_start_year"] = volume.spine_start_year + volume_data["spine_end_year"] = volume.spine_end_year + volume_data["publication_city"] = volume.publication_city + volume_data["second_part_of_id"] = volume.second_part_of_id + + # add information about volume's nominative_reporter + if volume.nominative_reporter_id: + volume_data["nominative_reporter"] = { + "id": volume.nominative_reporter_id, + "short_name": volume.nominative_reporter.short_name, + "full_name": volume.nominative_reporter.full_name, + "volume_number": volume.nominative_volume_number + } + elif volume.nominative_volume_number or volume.nominative_name: + volume_data["nominative_reporter"] = { + "volume_number": volume.nominative_volume_number, + "nominative_name": volume.nominative_name, + } + else: + volume_data["nominative_reporter"] = None + + # remove unnecessary fields + remove_keys(volume_data, [ + "reporter", + "reporter_url", + "url", + "pdf_url", + "frontend_url", + "nominative_volume_number", + "nominative_name", + ("jurisdictions", ["slug", "whitelisted", "url"]), + ]) + + return volume_data + + +def copy_volume_pdf( + volume: object, volume_prefix: str, dest_bucket: str, redacted: bool +) -> None: + """ + Copy PDF volume from original location to destination bucket + """ + s3_client = boto3.client("s3") + + if redacted: + source_prefix = "pdf/redacted" + else: + source_prefix = "pdf/unredacted" + + try: + s3_client.head_object(Bucket=dest_bucket, Key=f"{volume_prefix}/Volume.pdf") + print(f"{dest_bucket}/{volume_prefix}/Volume.pdf already uploaded!") + except ClientError as err: + if err.response["Error"]["Code"] == "404": + # "With a copy command, the checksum of the object is a direct checksum of the full object." 
+ # https://docs.aws.amazon.com/AmazonS3/latest/userguide/checking-object-integrity.html + copy_source = { + "Bucket": "harvard-cap-archive", + "Key": f"{source_prefix}/{volume.barcode}.pdf", + } + copy_object_params = { + "Bucket": dest_bucket, + "Key": f"{volume_prefix}/Volume.pdf", + "CopySource": copy_source, + } + + s3_client.copy_object(**copy_object_params) + print( + f"Copied {source_prefix}/{volume.barcode}.pdf to \ + {volume_prefix}/Volume.pdf" + ) + else: + raise Exception( + f"Cannot upload {source_prefix}/{volume.barcode}.pdf to \ + {volume_prefix}/Volume.pdf: %s" + % err + ) + + + +### helpers ### + +# Some reporters share a slug, so we have to differentiate with ids +reporter_slug_dict = { + 415: "us-ct-cl", + 657: "wv-ct-cl", + 580: "mass-app-div-annual", + 576: "mass-app-div", +} +reporter_slug_dict_reverse = {v: k for k, v in reporter_slug_dict.items()} + +def remove_keys(results: dict, keys: list) -> dict: + """ + Remove keys from results dict + """ + for key in keys: + if type(key) is tuple: + key, subkeys = key + if key in results: + value = results[key] + if type(value) is list: + for subvalue in value: + remove_keys(subvalue, subkeys) + else: + remove_keys(value, subkeys) + else: + results.pop(key, None) + return results + +def write_json(path: Path, contents) -> None: + """ + Write contents to path + """ + path.write_text(json.dumps(contents, indent=2) + "\n") diff --git a/capstone/scripts/tests/test_cap_static.py b/capstone/scripts/tests/test_cap_static.py new file mode 100644 index 000000000..e90feed2e --- /dev/null +++ b/capstone/scripts/tests/test_cap_static.py @@ -0,0 +1,48 @@ +import shutil +from pathlib import Path + +import pytest +from django.conf import settings + +from capdb.models import VolumeMetadata +from fabfile import export_cap_static_cases, summarize_cap_static, update_elasticsearch_from_queue +from test_data.test_fixtures.helpers import check_path + + +@pytest.mark.django_db(databases=['capdb']) +def test_export_cap_static(case_factory, jurisdiction_factory, redacted_case_factory, volume_metadata_factory, reporter_factory, tmp_path, pytestconfig, elasticsearch, django_assert_num_queries): + # set up a reporter with two volumes, each with three cases + jurisdiction = jurisdiction_factory(name_long="United States", name="U.S.", slug='us') + jurisdiction2 = jurisdiction_factory(name_long="Massachusetts", name="Mass.", slug='mass') + reporter = reporter_factory(full_name="United States Reports", short_name="U.S.", short_name_slug='us') + reporter.jurisdictions.set([jurisdiction, jurisdiction2]) + volumes = [volume_metadata_factory(volume_number=volume_number, reporter=reporter, redacted=True) for volume_number in ("1", "2")] + for volume in volumes: + case_factory(volume=volume, first_page="1", reporter=reporter, jurisdiction=jurisdiction) + case_factory(volume=volume, first_page="2", reporter=reporter, jurisdiction=jurisdiction) + redacted_case_factory(volume=volume, first_page="2", reporter=reporter, jurisdiction=jurisdiction2) + # for some reason case_factory is creating extra volumes, so delete those + VolumeMetadata.objects.exclude(pk__in=[v.pk for v in volumes]).update(out_of_scope=True) + update_elasticsearch_from_queue() + + # run export to temp dir + with django_assert_num_queries(select=37, update=8, insert=2, delete=2, rollback=2): + export_cap_static_cases(dest_dir=str(tmp_path)) + with django_assert_num_queries(select=8): + summarize_cap_static(str(tmp_path)) + + # compare temp dir to test_data/cap_static + cap_static_dir = 
Path(settings.BASE_DIR, 'test_data/cap_static') + if pytestconfig.getoption('recreate_files'): + # if --recreate-files was passed, copy temp dir to test_data/cap_static instead of checking + if cap_static_dir.exists(): + shutil.rmtree(cap_static_dir) + shutil.copytree(tmp_path, cap_static_dir) + else: + cap_static_paths = [p.relative_to(cap_static_dir) for p in cap_static_dir.rglob('*')] + tmp_paths = [p.relative_to(tmp_path) for p in tmp_path.rglob('*')] + assert cap_static_paths == tmp_paths, "Missing or extra files in cap_static export." + for path in tmp_path.rglob('*'): + if not path.is_file(): + continue + check_path(pytestconfig, path, cap_static_dir / path.relative_to(tmp_path)) diff --git a/capstone/scripts/tests/test_fastcase.py b/capstone/scripts/tests/test_fastcase.py index 78deefcf2..2c2f3d831 100644 --- a/capstone/scripts/tests/test_fastcase.py +++ b/capstone/scripts/tests/test_fastcase.py @@ -13,7 +13,7 @@ from fabfile import refresh_case_body_cache from scripts.fastcase import ingest_fastcase from scripts.fastcase.format_fastcase import segment_paragraphs -from test_data.test_fixtures.helpers import sort_nested_dict +from test_data.test_fixtures.helpers import sort_nested_dict, check_path @pytest.mark.parametrize("input,expected", [ @@ -66,19 +66,6 @@ def test_fastcase_ingest(tmp_path, pytestconfig, elasticsearch): copy_tree(str(fastcase_dir), str(tmp_path)) management.call_command('loaddata', 'capdb/fixtures/jurisdiction.capdb.json.gz', 'capdb/fixtures/reporter.capdb.json.gz', database='capdb') - # helper to check whether files have changed - def check_path(new_contents, saved_path): - if isinstance(new_contents, Path): - new_contents = new_contents.read_text() - old_contents = saved_path.read_text() if saved_path.exists() else '' - if new_contents != old_contents: - if pytestconfig.getoption('recreate_fastcase_files'): - saved_path.write_text(new_contents) - elif old_contents: - assert new_contents == old_contents, f"File {saved_path} has changed. Run pytest -k test_fastcase_ingest --recreate_fastcase_files to update." - else: - assert False, f"File {saved_path} does not exist. Run pytest -k test_fastcase_ingest --recreate_fastcase_files to update." - # run the ingest ingest_fastcase.pack_volumes(tmp_path, recreate=True) ingest_fastcase.main(batch='test_batch', base_dir=tmp_path) @@ -104,10 +91,11 @@ def check_path(new_contents, saved_path): case_data['has_body_cache'] = bool(case.body_cache) cases[case.case_id] = case_data # check case html for changes - check_path(case.body_cache.html, fastcase_dir.joinpath(case.fastcase_import.path).with_suffix('.html')) + check_path(pytestconfig, case.body_cache.html, + fastcase_dir.joinpath(case.fastcase_import.path).with_suffix('.html')) volume_data['cases'] = cases data[volume.pk] = volume_data data = sort_nested_dict(data) # check metadata files for changes - check_path(yaml.dump(data), fastcase_dir / 'data.yml') + check_path(pytestconfig, yaml.dump(data), fastcase_dir / 'data.yml') diff --git a/capstone/scripts/update_snippets.py b/capstone/scripts/update_snippets.py index 4e0ba8643..f5ca6dc8b 100644 --- a/capstone/scripts/update_snippets.py +++ b/capstone/scripts/update_snippets.py @@ -112,6 +112,11 @@ def update_map_numbers(): """ Write map_numbers snippet. 
""" label = "map_numbers" snippet_format = "application/json" + output = get_map_numbers() + write_update(label, snippet_format, json.dumps(output)) + + +def get_map_numbers(): cursor = connections['capdb'].cursor() cursor.execute(r""" SELECT @@ -121,16 +126,19 @@ def update_map_numbers(): COUNT(DISTINCT c.reporter_id) AS reporter_count, SUM(CASE WHEN (c.first_page||c.last_page)~E'^\\d+$' THEN c.last_page::integer-c.first_page::integer+1 ELSE 1 END) AS page_count FROM capdb_jurisdiction j - LEFT JOIN capdb_casemetadata c ON j.id=c.jurisdiction_id + JOIN capdb_casemetadata c ON j.id=c.jurisdiction_id + JOIN capdb_volumemetadata v ON c.volume_id=v.barcode WHERE c.in_scope IS True + AND v.out_of_scope IS False GROUP BY j.id; """) # get column names from sql query cols = [col[0] for col in cursor.description] # create output where each key is a jurisdiction and each value is a dict of values from the sql query output = {row[0]: dict(zip(cols[1:], row[1:])) for row in cursor.fetchall()} - write_update(label, snippet_format, json.dumps(output)) + return output + def search_jurisdiction_list(): jurisdictions = [ (jurisdiction.slug, jurisdiction.name_long) diff --git a/capstone/test_data/cap_static/redacted/JurisdictionsMetadata.json b/capstone/test_data/cap_static/redacted/JurisdictionsMetadata.json new file mode 100644 index 000000000..459e5a1ac --- /dev/null +++ b/capstone/test_data/cap_static/redacted/JurisdictionsMetadata.json @@ -0,0 +1,44 @@ +[ + { + "id": 2, + "slug": "mass", + "name": "Mass.", + "name_long": "Massachusetts", + "case_count": 2, + "volume_count": 2, + "reporter_count": 1, + "page_count": 10, + "reporters": [ + { + "id": 1, + "full_name": "United States Reports", + "short_name": "U.S.", + "start_year": 1900, + "end_year": 2000, + "harvard_hollis_id": [], + "slug": "us" + } + ] + }, + { + "id": 1, + "slug": "us", + "name": "U.S.", + "name_long": "United States", + "case_count": 4, + "volume_count": 2, + "reporter_count": 1, + "page_count": 20, + "reporters": [ + { + "id": 1, + "full_name": "United States Reports", + "short_name": "U.S.", + "start_year": 1900, + "end_year": 2000, + "harvard_hollis_id": [], + "slug": "us" + } + ] + } +] diff --git a/capstone/test_data/cap_static/redacted/ReportersMetadata.json b/capstone/test_data/cap_static/redacted/ReportersMetadata.json new file mode 100644 index 000000000..d166b62ca --- /dev/null +++ b/capstone/test_data/cap_static/redacted/ReportersMetadata.json @@ -0,0 +1,23 @@ +[ + { + "id": 1, + "full_name": "United States Reports", + "short_name": "U.S.", + "start_year": 1900, + "end_year": 2000, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "harvard_hollis_id": [], + "slug": "us" + } +] diff --git a/capstone/test_data/cap_static/redacted/VolumesMetadata.json b/capstone/test_data/cap_static/redacted/VolumesMetadata.json new file mode 100644 index 000000000..72c2be61a --- /dev/null +++ b/capstone/test_data/cap_static/redacted/VolumesMetadata.json @@ -0,0 +1,60 @@ +[ + [ + { + "volume_number": "1", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "4909170303750", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + 
"publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null + }, + { + "volume_number": "2", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "5012832128833", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null + } + ] +] diff --git a/capstone/test_data/cap_static/redacted/us/1/CasesMetadata.json b/capstone/test_data/cap_static/redacted/us/1/CasesMetadata.json new file mode 100644 index 000000000..fb954bb3f --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/1/CasesMetadata.json @@ -0,0 +1,136 @@ +[ + { + "id": 1, + "name": "First Foo0 versus First Bar0", + "name_abbreviation": "Foo0 v. Bar0", + "decision_date": "1900-01-01", + "docket_number": "", + "first_page": "1", + "last_page": "5", + "citations": [ + { + "type": "official", + "cite": "28 U.S. 347" + } + ], + "court": { + "name_abbreviation": "Sound spend.", + "id": 1, + "name": "Father worry common past recognize." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [ + { + "cite": "362 U.S. 816", + "category": null, + "reporter": null, + "opinion_index": -1 + } + ], + "analysis": { + "random_id": 3210483407, + "random_bucket": 5839 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 3, + "last_page_order": 7 + }, + { + "id": 2, + "name": "First Foo1 versus First Bar1", + "name_abbreviation": "Foo1 v. Bar1", + "decision_date": "1901-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "257 U.S. 222" + } + ], + "court": { + "name_abbreviation": "Upon.", + "id": 2, + "name": "Opportunity cup speech." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [ + { + "cite": "934 U.S. 230", + "category": null, + "reporter": null, + "opinion_index": -1 + } + ], + "analysis": { + "random_id": 1950914210, + "random_bucket": 38562 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 4, + "last_page_order": 8 + }, + { + "id": 3, + "name": "First Foo2 versus First Bar2", + "name_abbreviation": "Foo2 v. Bar2", + "decision_date": "1902-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "311 U.S. 951" + } + ], + "court": { + "name_abbreviation": "Rule six your.", + "id": 3, + "name": "Little perhaps look many." + }, + "jurisdiction": { + "id": 2, + "name_long": "Massachusetts", + "name": "Mass." 
+ }, + "cites_to": [], + "analysis": { + "cardinality": 2, + "char_count": 14, + "ocr_confidence": 0, + "sha256": "8874c1d06c05904e12fef4e76fcdd0ce48b33013e2067d5ebba936f78b8c5d89", + "simhash": "1:0000000000000000", + "word_count": 2, + "random_id": 2863198571, + "random_bucket": 61803 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 4, + "last_page_order": 8 + } +] diff --git a/capstone/test_data/cap_static/redacted/us/1/VolumeMetadata.json b/capstone/test_data/cap_static/redacted/us/1/VolumeMetadata.json new file mode 100644 index 000000000..2f4470a42 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/1/VolumeMetadata.json @@ -0,0 +1,28 @@ +{ + "volume_number": "1", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "4909170303750", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null +} diff --git a/capstone/test_data/cap_static/redacted/us/1/cases/0001-01.json b/capstone/test_data/cap_static/redacted/us/1/cases/0001-01.json new file mode 100644 index 000000000..d9010b57b --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/1/cases/0001-01.json @@ -0,0 +1,66 @@ +{ + "id": 1, + "name": "First Foo0 versus First Bar0", + "name_abbreviation": "Foo0 v. Bar0", + "decision_date": "1900-01-01", + "docket_number": "", + "first_page": "1", + "last_page": "5", + "citations": [ + { + "type": "official", + "cite": "28 U.S. 347" + } + ], + "court": { + "name_abbreviation": "Sound spend.", + "id": 1, + "name": "Father worry common past recognize." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [ + { + "cite": "362 U.S. 816", + "category": null, + "reporter": null, + "opinion_index": -1 + } + ], + "analysis": { + "random_id": 3210483407, + "random_bucket": 5839 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [ + "Fearing, C.J., and Korsmo, J., concur." + ], + "parties": [ + "In the Matter of the Marriage of Christy Lyle, Respondent, and Keith Lyle, Appellant." + ], + "opinions": [ + { + "text": "Opinion text", + "type": "majority", + "author": "Pennell, J." + } + ], + "attorneys": [ + "Matthew J. Dudley, for appellant.", + "Camerina I. Brokaw-Zorrozua (of Maxey Law Office PS), for respondent." + ], + "corrections": "", + "head_matter": "head matter" + }, + "first_page_order": 3, + "last_page_order": 7 +} diff --git a/capstone/test_data/cap_static/redacted/us/1/cases/0002-01.json b/capstone/test_data/cap_static/redacted/us/1/cases/0002-01.json new file mode 100644 index 000000000..099ae6342 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/1/cases/0002-01.json @@ -0,0 +1,66 @@ +{ + "id": 2, + "name": "First Foo1 versus First Bar1", + "name_abbreviation": "Foo1 v. Bar1", + "decision_date": "1901-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "257 U.S. 
222" + } + ], + "court": { + "name_abbreviation": "Upon.", + "id": 2, + "name": "Opportunity cup speech." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [ + { + "cite": "934 U.S. 230", + "category": null, + "reporter": null, + "opinion_index": -1 + } + ], + "analysis": { + "random_id": 1950914210, + "random_bucket": 38562 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [ + "Fearing, C.J., and Korsmo, J., concur." + ], + "parties": [ + "In the Matter of the Marriage of Christy Lyle, Respondent, and Keith Lyle, Appellant." + ], + "opinions": [ + { + "text": "Opinion text", + "type": "majority", + "author": "Pennell, J." + } + ], + "attorneys": [ + "Matthew J. Dudley, for appellant.", + "Camerina I. Brokaw-Zorrozua (of Maxey Law Office PS), for respondent." + ], + "corrections": "", + "head_matter": "head matter" + }, + "first_page_order": 4, + "last_page_order": 8 +} diff --git a/capstone/test_data/cap_static/redacted/us/1/cases/0002-02.json b/capstone/test_data/cap_static/redacted/us/1/cases/0002-02.json new file mode 100644 index 000000000..3dd2a37bc --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/1/cases/0002-02.json @@ -0,0 +1,58 @@ +{ + "id": 3, + "name": "First Foo2 versus First Bar2", + "name_abbreviation": "Foo2 v. Bar2", + "decision_date": "1902-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "311 U.S. 951" + } + ], + "court": { + "name_abbreviation": "Rule six your.", + "id": 3, + "name": "Little perhaps look many." + }, + "jurisdiction": { + "id": 2, + "name_long": "Massachusetts", + "name": "Mass." + }, + "cites_to": [], + "analysis": { + "cardinality": 2, + "char_count": 14, + "ocr_confidence": 0, + "sha256": "8874c1d06c05904e12fef4e76fcdd0ce48b33013e2067d5ebba936f78b8c5d89", + "simhash": "1:0000000000000000", + "word_count": 2, + "random_id": 2863198571, + "random_bucket": 61803 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [], + "parties": [], + "opinions": [ + { + "text": "not redacted", + "type": "majority", + "author": null + } + ], + "attorneys": [], + "corrections": "", + "head_matter": "" + }, + "first_page_order": 4, + "last_page_order": 8 +} diff --git a/capstone/test_data/cap_static/redacted/us/1/html/0001-01.html b/capstone/test_data/cap_static/redacted/us/1/html/0001-01.html new file mode 100644 index 000000000..5357a97ee --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/1/html/0001-01.html @@ -0,0 +1 @@ +

Case html 0

\ No newline at end of file diff --git a/capstone/test_data/cap_static/redacted/us/1/html/0002-01.html b/capstone/test_data/cap_static/redacted/us/1/html/0002-01.html new file mode 100644 index 000000000..56e046328 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/1/html/0002-01.html @@ -0,0 +1 @@ +

Case html 1

\ No newline at end of file diff --git a/capstone/test_data/cap_static/redacted/us/1/html/0002-02.html b/capstone/test_data/cap_static/redacted/us/1/html/0002-02.html new file mode 100644 index 000000000..13bc3490b --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/1/html/0002-02.html @@ -0,0 +1,6 @@ +
+
+
+

not redacted

+
+
diff --git a/capstone/test_data/cap_static/redacted/us/2/CasesMetadata.json b/capstone/test_data/cap_static/redacted/us/2/CasesMetadata.json new file mode 100644 index 000000000..6d1ea2ac7 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/2/CasesMetadata.json @@ -0,0 +1,136 @@ +[ + { + "id": 4, + "name": "First Foo3 versus First Bar3", + "name_abbreviation": "Foo3 v. Bar3", + "decision_date": "1903-01-01", + "docket_number": "", + "first_page": "1", + "last_page": "5", + "citations": [ + { + "type": "official", + "cite": "447 U.S. 189" + } + ], + "court": { + "name_abbreviation": "Quickly walk.", + "id": 4, + "name": "Focus detail several position." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [ + { + "cite": "524 U.S. 591", + "category": null, + "reporter": null, + "opinion_index": -1 + } + ], + "analysis": { + "random_id": 3673096554, + "random_bucket": 362 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 3, + "last_page_order": 7 + }, + { + "id": 5, + "name": "First Foo4 versus First Bar4", + "name_abbreviation": "Foo4 v. Bar4", + "decision_date": "1904-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "872 U.S. 266" + } + ], + "court": { + "name_abbreviation": "Like area.", + "id": 5, + "name": "Commercial edge agency ground risk." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [ + { + "cite": "596 U.S. 768", + "category": null, + "reporter": null, + "opinion_index": -1 + } + ], + "analysis": { + "random_id": 2595981630, + "random_bucket": 35134 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 4, + "last_page_order": 8 + }, + { + "id": 6, + "name": "First Foo5 versus First Bar5", + "name_abbreviation": "Foo5 v. Bar5", + "decision_date": "1905-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "958 U.S. 5" + } + ], + "court": { + "name_abbreviation": "Same religious.", + "id": 6, + "name": "Size fish back degree." + }, + "jurisdiction": { + "id": 2, + "name_long": "Massachusetts", + "name": "Mass." 
+ }, + "cites_to": [], + "analysis": { + "cardinality": 2, + "char_count": 14, + "ocr_confidence": 0, + "sha256": "8874c1d06c05904e12fef4e76fcdd0ce48b33013e2067d5ebba936f78b8c5d89", + "simhash": "1:0000000000000000", + "word_count": 2, + "random_id": 3120486931, + "random_bucket": 55827 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 4, + "last_page_order": 8 + } +] diff --git a/capstone/test_data/cap_static/redacted/us/2/VolumeMetadata.json b/capstone/test_data/cap_static/redacted/us/2/VolumeMetadata.json new file mode 100644 index 000000000..34bd2f2f0 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/2/VolumeMetadata.json @@ -0,0 +1,28 @@ +{ + "volume_number": "2", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "5012832128833", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null +} diff --git a/capstone/test_data/cap_static/redacted/us/2/cases/0001-01.json b/capstone/test_data/cap_static/redacted/us/2/cases/0001-01.json new file mode 100644 index 000000000..f2556140b --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/2/cases/0001-01.json @@ -0,0 +1,66 @@ +{ + "id": 4, + "name": "First Foo3 versus First Bar3", + "name_abbreviation": "Foo3 v. Bar3", + "decision_date": "1903-01-01", + "docket_number": "", + "first_page": "1", + "last_page": "5", + "citations": [ + { + "type": "official", + "cite": "447 U.S. 189" + } + ], + "court": { + "name_abbreviation": "Quickly walk.", + "id": 4, + "name": "Focus detail several position." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [ + { + "cite": "524 U.S. 591", + "category": null, + "reporter": null, + "opinion_index": -1 + } + ], + "analysis": { + "random_id": 3673096554, + "random_bucket": 362 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [ + "Fearing, C.J., and Korsmo, J., concur." + ], + "parties": [ + "In the Matter of the Marriage of Christy Lyle, Respondent, and Keith Lyle, Appellant." + ], + "opinions": [ + { + "text": "Opinion text", + "type": "majority", + "author": "Pennell, J." + } + ], + "attorneys": [ + "Matthew J. Dudley, for appellant.", + "Camerina I. Brokaw-Zorrozua (of Maxey Law Office PS), for respondent." + ], + "corrections": "", + "head_matter": "head matter" + }, + "first_page_order": 3, + "last_page_order": 7 +} diff --git a/capstone/test_data/cap_static/redacted/us/2/cases/0002-01.json b/capstone/test_data/cap_static/redacted/us/2/cases/0002-01.json new file mode 100644 index 000000000..db6983ef4 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/2/cases/0002-01.json @@ -0,0 +1,66 @@ +{ + "id": 5, + "name": "First Foo4 versus First Bar4", + "name_abbreviation": "Foo4 v. Bar4", + "decision_date": "1904-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "872 U.S. 
266" + } + ], + "court": { + "name_abbreviation": "Like area.", + "id": 5, + "name": "Commercial edge agency ground risk." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [ + { + "cite": "596 U.S. 768", + "category": null, + "reporter": null, + "opinion_index": -1 + } + ], + "analysis": { + "random_id": 2595981630, + "random_bucket": 35134 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [ + "Fearing, C.J., and Korsmo, J., concur." + ], + "parties": [ + "In the Matter of the Marriage of Christy Lyle, Respondent, and Keith Lyle, Appellant." + ], + "opinions": [ + { + "text": "Opinion text", + "type": "majority", + "author": "Pennell, J." + } + ], + "attorneys": [ + "Matthew J. Dudley, for appellant.", + "Camerina I. Brokaw-Zorrozua (of Maxey Law Office PS), for respondent." + ], + "corrections": "", + "head_matter": "head matter" + }, + "first_page_order": 4, + "last_page_order": 8 +} diff --git a/capstone/test_data/cap_static/redacted/us/2/cases/0002-02.json b/capstone/test_data/cap_static/redacted/us/2/cases/0002-02.json new file mode 100644 index 000000000..007da6102 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/2/cases/0002-02.json @@ -0,0 +1,58 @@ +{ + "id": 6, + "name": "First Foo5 versus First Bar5", + "name_abbreviation": "Foo5 v. Bar5", + "decision_date": "1905-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "958 U.S. 5" + } + ], + "court": { + "name_abbreviation": "Same religious.", + "id": 6, + "name": "Size fish back degree." + }, + "jurisdiction": { + "id": 2, + "name_long": "Massachusetts", + "name": "Mass." + }, + "cites_to": [], + "analysis": { + "cardinality": 2, + "char_count": 14, + "ocr_confidence": 0, + "sha256": "8874c1d06c05904e12fef4e76fcdd0ce48b33013e2067d5ebba936f78b8c5d89", + "simhash": "1:0000000000000000", + "word_count": 2, + "random_id": 3120486931, + "random_bucket": 55827 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [], + "parties": [], + "opinions": [ + { + "text": "not redacted", + "type": "majority", + "author": null + } + ], + "attorneys": [], + "corrections": "", + "head_matter": "" + }, + "first_page_order": 4, + "last_page_order": 8 +} diff --git a/capstone/test_data/cap_static/redacted/us/2/html/0001-01.html b/capstone/test_data/cap_static/redacted/us/2/html/0001-01.html new file mode 100644 index 000000000..682bfc16f --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/2/html/0001-01.html @@ -0,0 +1 @@ +

Case html 3

\ No newline at end of file diff --git a/capstone/test_data/cap_static/redacted/us/2/html/0002-01.html b/capstone/test_data/cap_static/redacted/us/2/html/0002-01.html new file mode 100644 index 000000000..75d3446a8 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/2/html/0002-01.html @@ -0,0 +1 @@ +

Case html 4

\ No newline at end of file diff --git a/capstone/test_data/cap_static/redacted/us/2/html/0002-02.html b/capstone/test_data/cap_static/redacted/us/2/html/0002-02.html new file mode 100644 index 000000000..75cea3a52 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/2/html/0002-02.html @@ -0,0 +1,6 @@ +
+
+
+

not redacted

+
+
diff --git a/capstone/test_data/cap_static/redacted/us/ReporterMetadata.json b/capstone/test_data/cap_static/redacted/us/ReporterMetadata.json new file mode 100644 index 000000000..85516ff50 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/ReporterMetadata.json @@ -0,0 +1,21 @@ +{ + "id": 1, + "full_name": "United States Reports", + "short_name": "U.S.", + "start_year": 1900, + "end_year": 2000, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "harvard_hollis_id": [], + "slug": "us" +} diff --git a/capstone/test_data/cap_static/redacted/us/VolumesMetadata.json b/capstone/test_data/cap_static/redacted/us/VolumesMetadata.json new file mode 100644 index 000000000..1a24582d4 --- /dev/null +++ b/capstone/test_data/cap_static/redacted/us/VolumesMetadata.json @@ -0,0 +1,58 @@ +[ + { + "volume_number": "1", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "4909170303750", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null + }, + { + "volume_number": "2", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "5012832128833", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null + } +] diff --git a/capstone/test_data/cap_static/unredacted/JurisdictionsMetadata.json b/capstone/test_data/cap_static/unredacted/JurisdictionsMetadata.json new file mode 100644 index 000000000..459e5a1ac --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/JurisdictionsMetadata.json @@ -0,0 +1,44 @@ +[ + { + "id": 2, + "slug": "mass", + "name": "Mass.", + "name_long": "Massachusetts", + "case_count": 2, + "volume_count": 2, + "reporter_count": 1, + "page_count": 10, + "reporters": [ + { + "id": 1, + "full_name": "United States Reports", + "short_name": "U.S.", + "start_year": 1900, + "end_year": 2000, + "harvard_hollis_id": [], + "slug": "us" + } + ] + }, + { + "id": 1, + "slug": "us", + "name": "U.S.", + "name_long": "United States", + "case_count": 4, + "volume_count": 2, + "reporter_count": 1, + "page_count": 20, + "reporters": [ + { + "id": 1, + "full_name": "United States Reports", + "short_name": "U.S.", + "start_year": 1900, + "end_year": 2000, + "harvard_hollis_id": [], + "slug": "us" + } + ] + } +] diff --git a/capstone/test_data/cap_static/unredacted/ReportersMetadata.json b/capstone/test_data/cap_static/unredacted/ReportersMetadata.json new file mode 100644 index 000000000..d166b62ca --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/ReportersMetadata.json @@ -0,0 +1,23 @@ +[ + { + "id": 1, + "full_name": "United States Reports", + "short_name": "U.S.", + "start_year": 1900, + "end_year": 2000, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + 
"name_long": "United States" + } + ], + "harvard_hollis_id": [], + "slug": "us" + } +] diff --git a/capstone/test_data/cap_static/unredacted/VolumesMetadata.json b/capstone/test_data/cap_static/unredacted/VolumesMetadata.json new file mode 100644 index 000000000..72c2be61a --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/VolumesMetadata.json @@ -0,0 +1,60 @@ +[ + [ + { + "volume_number": "1", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "4909170303750", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null + }, + { + "volume_number": "2", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "5012832128833", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null + } + ] +] diff --git a/capstone/test_data/cap_static/unredacted/us/1/CasesMetadata.json b/capstone/test_data/cap_static/unredacted/us/1/CasesMetadata.json new file mode 100644 index 000000000..f5ef1c8ff --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/1/CasesMetadata.json @@ -0,0 +1,134 @@ +[ + { + "id": 1, + "name": "First Foo0 versus First Bar0", + "name_abbreviation": "Foo0 v. Bar0", + "decision_date": "1900-01-01", + "docket_number": "", + "first_page": "1", + "last_page": "5", + "citations": [ + { + "type": "official", + "cite": "28 U.S. 347" + } + ], + "court": { + "name_abbreviation": "Sound spend.", + "id": 1, + "name": "Father worry common past recognize." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 47, + "ocr_confidence": 1.0, + "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc", + "simhash": "1:6e45862a08eb1d4c", + "word_count": 11, + "random_id": 3210483407, + "random_bucket": 5839 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 3, + "last_page_order": 7 + }, + { + "id": 2, + "name": "First Foo1 versus First Bar1", + "name_abbreviation": "Foo1 v. Bar1", + "decision_date": "1901-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "257 U.S. 222" + } + ], + "court": { + "name_abbreviation": "Upon.", + "id": 2, + "name": "Opportunity cup speech." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." 
+ }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 47, + "ocr_confidence": 1.0, + "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc", + "simhash": "1:6e45862a08eb1d4c", + "word_count": 11, + "random_id": 1950914210, + "random_bucket": 38562 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 4, + "last_page_order": 8 + }, + { + "id": 3, + "name": "First Foo2 versus First Bar2", + "name_abbreviation": "Foo2 v. Bar2", + "decision_date": "1902-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "311 U.S. 951" + } + ], + "court": { + "name_abbreviation": "Rule six your.", + "id": 3, + "name": "Little perhaps look many." + }, + "jurisdiction": { + "id": 2, + "name_long": "Massachusetts", + "name": "Mass." + }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 39, + "ocr_confidence": 0, + "sha256": "e0819f285636dcbd644be2d72c1ef1e0e616ca51d3445280adbf00eab401e7c5", + "simhash": "1:03208952f875022c", + "word_count": 8, + "random_id": 2863198571, + "random_bucket": 61803 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 4, + "last_page_order": 8 + } +] diff --git a/capstone/test_data/cap_static/unredacted/us/1/VolumeMetadata.json b/capstone/test_data/cap_static/unredacted/us/1/VolumeMetadata.json new file mode 100644 index 000000000..2f4470a42 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/1/VolumeMetadata.json @@ -0,0 +1,28 @@ +{ + "volume_number": "1", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "4909170303750", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null +} diff --git a/capstone/test_data/cap_static/unredacted/us/1/cases/0001-01.json b/capstone/test_data/cap_static/unredacted/us/1/cases/0001-01.json new file mode 100644 index 000000000..151f59be7 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/1/cases/0001-01.json @@ -0,0 +1,60 @@ +{ + "id": 1, + "name": "First Foo0 versus First Bar0", + "name_abbreviation": "Foo0 v. Bar0", + "decision_date": "1900-01-01", + "docket_number": "", + "first_page": "1", + "last_page": "5", + "citations": [ + { + "type": "official", + "cite": "28 U.S. 347" + } + ], + "court": { + "name_abbreviation": "Sound spend.", + "id": 1, + "name": "Father worry common past recognize." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." 
+ }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 47, + "ocr_confidence": 1.0, + "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc", + "simhash": "1:6e45862a08eb1d4c", + "word_count": 11, + "random_id": 3210483407, + "random_bucket": 5839 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [], + "parties": [ + "Case text 0" + ], + "opinions": [ + { + "text": "Case text 1Case text 2\nCase text 3", + "type": "majority", + "author": null + } + ], + "attorneys": [], + "corrections": "", + "head_matter": "Case text 0" + }, + "first_page_order": 3, + "last_page_order": 7 +} diff --git a/capstone/test_data/cap_static/unredacted/us/1/cases/0002-01.json b/capstone/test_data/cap_static/unredacted/us/1/cases/0002-01.json new file mode 100644 index 000000000..e6350e473 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/1/cases/0002-01.json @@ -0,0 +1,60 @@ +{ + "id": 2, + "name": "First Foo1 versus First Bar1", + "name_abbreviation": "Foo1 v. Bar1", + "decision_date": "1901-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "257 U.S. 222" + } + ], + "court": { + "name_abbreviation": "Upon.", + "id": 2, + "name": "Opportunity cup speech." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 47, + "ocr_confidence": 1.0, + "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc", + "simhash": "1:6e45862a08eb1d4c", + "word_count": 11, + "random_id": 1950914210, + "random_bucket": 38562 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [], + "parties": [ + "Case text 0" + ], + "opinions": [ + { + "text": "Case text 1Case text 2\nCase text 3", + "type": "majority", + "author": null + } + ], + "attorneys": [], + "corrections": "", + "head_matter": "Case text 0" + }, + "first_page_order": 4, + "last_page_order": 8 +} diff --git a/capstone/test_data/cap_static/unredacted/us/1/cases/0002-02.json b/capstone/test_data/cap_static/unredacted/us/1/cases/0002-02.json new file mode 100644 index 000000000..23937392e --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/1/cases/0002-02.json @@ -0,0 +1,60 @@ +{ + "id": 3, + "name": "First Foo2 versus First Bar2", + "name_abbreviation": "Foo2 v. Bar2", + "decision_date": "1902-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "311 U.S. 951" + } + ], + "court": { + "name_abbreviation": "Rule six your.", + "id": 3, + "name": "Little perhaps look many." + }, + "jurisdiction": { + "id": 2, + "name_long": "Massachusetts", + "name": "Mass." 
+ }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 39, + "ocr_confidence": 0, + "sha256": "e0819f285636dcbd644be2d72c1ef1e0e616ca51d3445280adbf00eab401e7c5", + "simhash": "1:03208952f875022c", + "word_count": 8, + "random_id": 2863198571, + "random_bucket": 61803 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [], + "parties": [ + "Text 1" + ], + "opinions": [ + { + "text": "Text 2Text 3not redacted\nText 4", + "type": "majority", + "author": null + } + ], + "attorneys": [], + "corrections": "", + "head_matter": "Text 1" + }, + "first_page_order": 4, + "last_page_order": 8 +} diff --git a/capstone/test_data/cap_static/unredacted/us/1/html/0001-01.html b/capstone/test_data/cap_static/unredacted/us/1/html/0001-01.html new file mode 100644 index 000000000..46b815f9c --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/1/html/0001-01.html @@ -0,0 +1,12 @@ +
+
+

Case text 0

+
+
+

Case text 1Case text 2

+ +
+
diff --git a/capstone/test_data/cap_static/unredacted/us/1/html/0002-01.html b/capstone/test_data/cap_static/unredacted/us/1/html/0002-01.html new file mode 100644 index 000000000..bbefaa490 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/1/html/0002-01.html @@ -0,0 +1,12 @@ +
+
+

Case text 0

+
+
+

Case text 1Case text 2

+ +
+
diff --git a/capstone/test_data/cap_static/unredacted/us/1/html/0002-02.html b/capstone/test_data/cap_static/unredacted/us/1/html/0002-02.html new file mode 100644 index 000000000..9d537bcb2 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/1/html/0002-02.html @@ -0,0 +1,15 @@ +
+
+

Text 1

+
+
+

Text 2Text 3not redacted

+

+ +

+ +
+
diff --git a/capstone/test_data/cap_static/unredacted/us/2/CasesMetadata.json b/capstone/test_data/cap_static/unredacted/us/2/CasesMetadata.json new file mode 100644 index 000000000..29ed827d1 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/2/CasesMetadata.json @@ -0,0 +1,134 @@ +[ + { + "id": 4, + "name": "First Foo3 versus First Bar3", + "name_abbreviation": "Foo3 v. Bar3", + "decision_date": "1903-01-01", + "docket_number": "", + "first_page": "1", + "last_page": "5", + "citations": [ + { + "type": "official", + "cite": "447 U.S. 189" + } + ], + "court": { + "name_abbreviation": "Quickly walk.", + "id": 4, + "name": "Focus detail several position." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 47, + "ocr_confidence": 1.0, + "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc", + "simhash": "1:6e45862a08eb1d4c", + "word_count": 11, + "random_id": 3673096554, + "random_bucket": 362 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 3, + "last_page_order": 7 + }, + { + "id": 5, + "name": "First Foo4 versus First Bar4", + "name_abbreviation": "Foo4 v. Bar4", + "decision_date": "1904-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "872 U.S. 266" + } + ], + "court": { + "name_abbreviation": "Like area.", + "id": 5, + "name": "Commercial edge agency ground risk." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 47, + "ocr_confidence": 1.0, + "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc", + "simhash": "1:6e45862a08eb1d4c", + "word_count": 11, + "random_id": 2595981630, + "random_bucket": 35134 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 4, + "last_page_order": 8 + }, + { + "id": 6, + "name": "First Foo5 versus First Bar5", + "name_abbreviation": "Foo5 v. Bar5", + "decision_date": "1905-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "958 U.S. 5" + } + ], + "court": { + "name_abbreviation": "Same religious.", + "id": 6, + "name": "Size fish back degree." + }, + "jurisdiction": { + "id": 2, + "name_long": "Massachusetts", + "name": "Mass." 
+ }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 39, + "ocr_confidence": 0, + "sha256": "e0819f285636dcbd644be2d72c1ef1e0e616ca51d3445280adbf00eab401e7c5", + "simhash": "1:03208952f875022c", + "word_count": 8, + "random_id": 3120486931, + "random_bucket": 55827 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "first_page_order": 4, + "last_page_order": 8 + } +] diff --git a/capstone/test_data/cap_static/unredacted/us/2/VolumeMetadata.json b/capstone/test_data/cap_static/unredacted/us/2/VolumeMetadata.json new file mode 100644 index 000000000..34bd2f2f0 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/2/VolumeMetadata.json @@ -0,0 +1,28 @@ +{ + "volume_number": "2", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "5012832128833", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null +} diff --git a/capstone/test_data/cap_static/unredacted/us/2/cases/0001-01.json b/capstone/test_data/cap_static/unredacted/us/2/cases/0001-01.json new file mode 100644 index 000000000..847039168 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/2/cases/0001-01.json @@ -0,0 +1,60 @@ +{ + "id": 4, + "name": "First Foo3 versus First Bar3", + "name_abbreviation": "Foo3 v. Bar3", + "decision_date": "1903-01-01", + "docket_number": "", + "first_page": "1", + "last_page": "5", + "citations": [ + { + "type": "official", + "cite": "447 U.S. 189" + } + ], + "court": { + "name_abbreviation": "Quickly walk.", + "id": 4, + "name": "Focus detail several position." + }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 47, + "ocr_confidence": 1.0, + "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc", + "simhash": "1:6e45862a08eb1d4c", + "word_count": 11, + "random_id": 3673096554, + "random_bucket": 362 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [], + "parties": [ + "Case text 0" + ], + "opinions": [ + { + "text": "Case text 1Case text 2\nCase text 3", + "type": "majority", + "author": null + } + ], + "attorneys": [], + "corrections": "", + "head_matter": "Case text 0" + }, + "first_page_order": 3, + "last_page_order": 7 +} diff --git a/capstone/test_data/cap_static/unredacted/us/2/cases/0002-01.json b/capstone/test_data/cap_static/unredacted/us/2/cases/0002-01.json new file mode 100644 index 000000000..5ec8f1fc9 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/2/cases/0002-01.json @@ -0,0 +1,60 @@ +{ + "id": 5, + "name": "First Foo4 versus First Bar4", + "name_abbreviation": "Foo4 v. Bar4", + "decision_date": "1904-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "872 U.S. 266" + } + ], + "court": { + "name_abbreviation": "Like area.", + "id": 5, + "name": "Commercial edge agency ground risk." 
+ }, + "jurisdiction": { + "id": 1, + "name_long": "United States", + "name": "U.S." + }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 47, + "ocr_confidence": 1.0, + "sha256": "da95df9d6d5d506285c9a8f9010560fa57905f64b3e94748b8854d678e18f0cc", + "simhash": "1:6e45862a08eb1d4c", + "word_count": 11, + "random_id": 2595981630, + "random_bucket": 35134 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [], + "parties": [ + "Case text 0" + ], + "opinions": [ + { + "text": "Case text 1Case text 2\nCase text 3", + "type": "majority", + "author": null + } + ], + "attorneys": [], + "corrections": "", + "head_matter": "Case text 0" + }, + "first_page_order": 4, + "last_page_order": 8 +} diff --git a/capstone/test_data/cap_static/unredacted/us/2/cases/0002-02.json b/capstone/test_data/cap_static/unredacted/us/2/cases/0002-02.json new file mode 100644 index 000000000..68917d4e2 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/2/cases/0002-02.json @@ -0,0 +1,60 @@ +{ + "id": 6, + "name": "First Foo5 versus First Bar5", + "name_abbreviation": "Foo5 v. Bar5", + "decision_date": "1905-01-01", + "docket_number": "", + "first_page": "2", + "last_page": "6", + "citations": [ + { + "type": "official", + "cite": "958 U.S. 5" + } + ], + "court": { + "name_abbreviation": "Same religious.", + "id": 6, + "name": "Size fish back degree." + }, + "jurisdiction": { + "id": 2, + "name_long": "Massachusetts", + "name": "Mass." + }, + "cites_to": [], + "analysis": { + "cardinality": 6, + "char_count": 39, + "ocr_confidence": 0, + "sha256": "e0819f285636dcbd644be2d72c1ef1e0e616ca51d3445280adbf00eab401e7c5", + "simhash": "1:03208952f875022c", + "word_count": 8, + "random_id": 3120486931, + "random_bucket": 55827 + }, + "last_updated": "2023-12-04T18:17:29.088002+00:00", + "provenance": { + "date_added": "2023-12-04", + "source": "Harvard", + "batch": "2018" + }, + "casebody": { + "judges": [], + "parties": [ + "Text 1" + ], + "opinions": [ + { + "text": "Text 2Text 3not redacted\nText 4", + "type": "majority", + "author": null + } + ], + "attorneys": [], + "corrections": "", + "head_matter": "Text 1" + }, + "first_page_order": 4, + "last_page_order": 8 +} diff --git a/capstone/test_data/cap_static/unredacted/us/2/html/0001-01.html b/capstone/test_data/cap_static/unredacted/us/2/html/0001-01.html new file mode 100644 index 000000000..0449b9be6 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/2/html/0001-01.html @@ -0,0 +1,12 @@ +
+
+

Case text 0

+
+
+

Case text 1Case text 2

+ +
+
diff --git a/capstone/test_data/cap_static/unredacted/us/2/html/0002-01.html b/capstone/test_data/cap_static/unredacted/us/2/html/0002-01.html new file mode 100644 index 000000000..edcdd9fac --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/2/html/0002-01.html @@ -0,0 +1,12 @@ +
+
+

Case text 0

+
+
+

Case text 1Case text 2

+ +
+
diff --git a/capstone/test_data/cap_static/unredacted/us/2/html/0002-02.html b/capstone/test_data/cap_static/unredacted/us/2/html/0002-02.html new file mode 100644 index 000000000..accac1425 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/2/html/0002-02.html @@ -0,0 +1,15 @@ +
+
+

Text 1

+
+
+

Text 2Text 3not redacted

+

+ +

+ +
+
diff --git a/capstone/test_data/cap_static/unredacted/us/ReporterMetadata.json b/capstone/test_data/cap_static/unredacted/us/ReporterMetadata.json new file mode 100644 index 000000000..85516ff50 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/ReporterMetadata.json @@ -0,0 +1,21 @@ +{ + "id": 1, + "full_name": "United States Reports", + "short_name": "U.S.", + "start_year": 1900, + "end_year": 2000, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "harvard_hollis_id": [], + "slug": "us" +} diff --git a/capstone/test_data/cap_static/unredacted/us/VolumesMetadata.json b/capstone/test_data/cap_static/unredacted/us/VolumesMetadata.json new file mode 100644 index 000000000..1a24582d4 --- /dev/null +++ b/capstone/test_data/cap_static/unredacted/us/VolumesMetadata.json @@ -0,0 +1,58 @@ +[ + { + "volume_number": "1", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "4909170303750", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null + }, + { + "volume_number": "2", + "title": null, + "publisher": null, + "publication_year": null, + "start_year": null, + "end_year": null, + "series_volume_number": null, + "jurisdictions": [ + { + "id": 2, + "name": "Mass.", + "name_long": "Massachusetts" + }, + { + "id": 1, + "name": "U.S.", + "name_long": "United States" + } + ], + "id": "5012832128833", + "harvard_hollis_id": null, + "spine_start_year": null, + "spine_end_year": null, + "publication_city": null, + "second_part_of_id": null, + "nominative_reporter": null + } +] diff --git a/capstone/test_data/test_fixtures/fixtures.py b/capstone/test_data/test_fixtures/fixtures.py index 9ee4a6954..7b7f37805 100644 --- a/capstone/test_data/test_fixtures/fixtures.py +++ b/capstone/test_data/test_fixtures/fixtures.py @@ -33,7 +33,7 @@ ### Pytest setup ### def pytest_addoption(parser): - parser.addoption("--recreate_fastcase_files", action="store_true", default=False, help="Recreate files in test_data/fastcase rather than testing existing files") + parser.addoption("--recreate_files", action="store_true", default=False, help="Recreate files in test_data/ rather than testing existing files") ### Database setup ### @@ -410,3 +410,101 @@ def urls(live_server): @pytest.fixture def map_data(): management.call_command('loaddata', ('jurisdiction', 'reporter', 'snippet'), database='capdb') + + +@pytest.fixture +def redacted_case_factory(case_factory): + def factory(**kwargs): + # set up a redacted case + case = case_factory(volume__redacted=True, volume__pdf_file="redacted_volume.pdf", **kwargs) + structure = case.structure + page = structure.pages.first() + structure.opinions = [ + # redacted paragraph + { + "type": "head", + "paragraphs": [ + { + "class": "parties", + "block_ids": ["BL_1.1"], + "id": "b1-1", + "redacted": True, + } + ], + }, + { + "type": "majority", + "paragraphs": [ + # redacted content blocks + { + "class": "p", + "block_ids": ["BL_1.2", "BL_1.3"], + "id": "b1-2", + }, + # redacted image block + { + "class": "image", + "block_ids": ["BL_1.4"], + "id": "b1-3", + }, + ], + # redacted footnote + 
"footnotes": [ + { + # redacted footnote paragraph + "paragraphs": [ + { + "class": "p", + "block_ids": ["BL_1.5"], + "id": "b1-4", + } + ], + "label": "1", + "id": "footnote_1_1", + "redacted": True, + } + ], + }, + ] + structure.save() + page.blocks = [ + { + "id": "BL_1.1", + "class": "p", + "tokens": ["Text 1"], + "rect": [25, 11, 300, 490], + }, + { + "id": "BL_1.2", + "class": "p", + "tokens": ["Text 2"], + "redacted": True, + "rect": [4, 32, 100, 100], + }, + { + "id": "BL_1.3", + "class": "p", + "tokens": [["redact"], "Text 3", ["/redact"], "not redacted"], + "rect": [225, 11, 430, 290], + }, + { + "id": "BL_1.4", + "format": "image", + "redacted": True, + "class": "image", + "data": "image data", + "rect": [0, 0, 100, 100], + }, + { + "id": "BL_1.5", + "class": "p", + "tokens": ["Text 4"], + "rect": [190, 312, 330, 490], + }, + ] + page.encrypt() + page.save() + case.sync_case_body_cache() + case.refresh_from_db() + return case + return factory diff --git a/capstone/test_data/test_fixtures/helpers.py b/capstone/test_data/test_fixtures/helpers.py index d9d1327ff..0ef0448e9 100644 --- a/capstone/test_data/test_fixtures/helpers.py +++ b/capstone/test_data/test_fixtures/helpers.py @@ -1,5 +1,6 @@ import difflib import hashlib +import os from pathlib import Path from scripts.helpers import parse_xml, parse_html @@ -114,3 +115,24 @@ def sort_nested_dict(d): if isinstance(d, (list, tuple)): return [sort_nested_dict(v) for v in d] return d + + +def current_test_name(): + return os.environ.get('PYTEST_CURRENT_TEST').split(':')[-1].split(' ')[0] + + +def check_path(pytestconfig, new_contents, saved_path): + """ + Either report a diff between new_contents and saved_path, or update saved_path to match new_contents, depending on pytest --recreate_files. + """ + if isinstance(new_contents, Path): + new_contents = new_contents.read_text() + old_contents = saved_path.read_text() if saved_path.exists() else '' + if new_contents != old_contents: + if pytestconfig.getoption('recreate_files'): + saved_path.parent.mkdir(parents=True, exist_ok=True) + saved_path.write_text(new_contents) + elif old_contents: + assert new_contents == old_contents, f"File {saved_path} has changed. Run pytest -k {current_test_name()} --recreate_files to update." + else: + assert False, f"File {saved_path} does not exist. Run pytest -k {current_test_name()} --recreate_files to update."