Skip to content
This repository has been archived by the owner on Oct 16, 2024. It is now read-only.

Commit

Permalink
Update convert_s3 script to export_cap_static
Browse files Browse the repository at this point in the history
- Writes to local disk
- Includes HTML
- Includes unredacted versions
- Includes jurisdiction metadata
- Formatting updates
  • Loading branch information
jcushman committed Dec 4, 2023
1 parent c65ee2f commit e2c436e
Show file tree
Hide file tree
Showing 56 changed files with 2,465 additions and 712 deletions.
11 changes: 10 additions & 1 deletion capstone/capapi/resources.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import hashlib
import concurrent.futures
from copy import copy
from copy import copy
from functools import reduce

import rest_framework.request
Expand All @@ -13,6 +13,7 @@
from django.db.models import QuerySet
from django.http import QueryDict
from django.test.utils import CaptureQueriesContext
from django.test.client import RequestFactory
from django.utils.functional import SimpleLazyObject
from django_hosts import reverse as django_hosts_reverse
from elasticsearch import Elasticsearch
Expand Down Expand Up @@ -269,3 +270,11 @@ def api_request(request, viewset, method, url_kwargs={}, get_params={}):
api_request.GET.update(get_params)

return viewset.as_view({'get': method})(api_request, **url_kwargs)


def call_serializer(Serializer, item, query_params=None):
    """
    Make a fake DRF request so we can call a DRF serializer with the expected context.
    """
    # Build a synthetic GET request (optionally carrying query params) and wrap
    # it in a DRF Request so serializers that read self.context['request'] work.
    fake_request = rest_framework.request.Request(RequestFactory().get('/', query_params))
    serializer = Serializer(item, context={'request': fake_request})
    return serializer.data
42 changes: 0 additions & 42 deletions capstone/capapi/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,49 +546,9 @@ def data(self):
return super(DocumentSerializer, self).data


class ConvertCaseDocumentSerializer(CaseDocumentSerializer):
    """
    CaseDocumentSerializer variant for the static export pipeline.

    Strips API-only fields (urls, slugs, preview, etc.) from the serialized
    case and attaches ``first_page_order`` / ``last_page_order`` values taken
    from the serializer context.
    """
    first_page_order = serializers.CharField()
    last_page_order = serializers.CharField()

    def to_representation(self, instance):
        # Page-order values are injected by the caller via context; may be None.
        first_page_order = self.context.get("first_page_order")
        last_page_order = self.context.get("last_page_order")

        data = super().to_representation(instance)

        # Drop fields that only make sense for the live API responses.
        data.pop("reporter", None)
        data.pop("volume", None)
        data.pop("url", None)
        data.pop("frontend_url", None)
        data.pop("frontend_pdf_url", None)
        try:
            # KeyError here means data["court"] itself is missing; the inner
            # pops use defaults and never raise.
            data["court"].pop("slug", None)
            data["court"].pop("url", None)
        except KeyError as err:
            print(f"Cannot pop field {err} because 'court' doesn't exist")
        try:
            data["jurisdiction"].pop("slug", None)
            data["jurisdiction"].pop("whitelisted", None)
            data["jurisdiction"].pop("url", None)
        except KeyError as err:
            print(f"Cannot pop field {err} because 'jurisdiction' doesn't exist")

        # pop with default instead of membership test + pop, consistent with
        # ConvertNoLoginCaseDocumentSerializer.to_representation
        data.pop("preview", None)
        data["first_page_order"] = first_page_order
        data["last_page_order"] = last_page_order
        return data


class ConvertNoLoginCaseDocumentSerializer(CaseDocumentSerializerWithCasebody):
first_page_order = serializers.CharField()
last_page_order = serializers.CharField()

def to_representation(self, instance, check_permissions=False):
"""Tell get_casebody not to check for case download permissions."""
first_page_order = self.context.get("first_page_order")
last_page_order = self.context.get("last_page_order")

data = super().to_representation(instance, check_permissions=check_permissions)
try:
data["casebody"] = data["casebody"]["data"]
Expand All @@ -615,8 +575,6 @@ def to_representation(self, instance, check_permissions=False):

data.pop("preview", None)

data["first_page_order"] = first_page_order
data["last_page_order"] = last_page_order
return data

@property
Expand Down
111 changes: 12 additions & 99 deletions capstone/capdb/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,96 +93,8 @@ def test_volume_save_slug_update(volume_metadata):


@pytest.mark.django_db(databases=["capdb"])
def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path):
# set up a redacted case
case = case_factory(volume__redacted=True, volume__pdf_file="redacted_volume.pdf")
structure = case.structure
page = structure.pages.first()
structure.opinions = [
# redacted paragraph
{
"type": "head",
"paragraphs": [
{
"class": "parties",
"block_ids": ["BL_1.1"],
"id": "b1-1",
"redacted": True,
}
],
},
{
"type": "majority",
"paragraphs": [
# redacted content blocks
{
"class": "p",
"block_ids": ["BL_1.2", "BL_1.3"],
"id": "b1-2",
},
# redacted image block
{
"class": "image",
"block_ids": ["BL_1.4"],
"id": "b1-3",
},
],
# redacted footnote
"footnotes": [
{
# redacted footnote paragraph
"paragraphs": [
{
"class": "p",
"block_ids": ["BL_1.5"],
"id": "b1-4",
}
],
"label": "1",
"id": "footnote_1_1",
"redacted": True,
}
],
},
]
structure.save()
page.blocks = [
{
"id": "BL_1.1",
"class": "p",
"tokens": ["Text 1"],
"rect": [25, 11, 300, 490],
},
{
"id": "BL_1.2",
"class": "p",
"tokens": ["Text 2"],
"redacted": True,
"rect": [4, 32, 100, 100],
},
{
"id": "BL_1.3",
"class": "p",
"tokens": [["redact"], "Text 3", ["/redact"]],
"rect": [225, 11, 430, 290],
},
{
"id": "BL_1.4",
"format": "image",
"redacted": True,
"class": "image",
"data": "image data",
"rect": [0, 0, 100, 100],
},
{
"id": "BL_1.5",
"class": "p",
"tokens": ["Text 4"],
"rect": [190, 312, 330, 490],
},
]
page.encrypt()
page.save()
def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path, redacted_case_factory):
case = redacted_case_factory()

# set up volume pdfs
volume = case.volume
Expand All @@ -194,23 +106,24 @@ def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path):
download_files_storage.save(volume.pdf_file.name, StringIO("redacted"))

# verify redacted case contents
case.sync_case_body_cache()
case.refresh_from_db()
assert case.body_cache.text == "\n\n"
assert case.body_cache.text == "\nnot redacted\n"
assert xml_equal(
case.body_cache.html,
'<section class="casebody" data-case-id="00000000" data-firstpage="4" data-lastpage="8">\n'
' <section class="head-matter"/>\n'
' <article class="opinion" data-type="majority"/>\n'
"</section>",
'<section class="casebody" data-case-id="00000000" data-firstpage="4" '
'data-lastpage="8">\n'
' <section class="head-matter"></section>\n'
' <article class="opinion" data-type="majority">\n'
' <p id="b1-2" data-blocks=\'[["BL_1.2",0,[4,32,100,100]],["BL_1.3",0,[225,11,430,290]]]\'>not redacted</p>\n'
' </article>\n'
'</section>\n',
)

# unredact
volume.unredact()
volume.refresh_from_db()
case.body_cache.refresh_from_db()
assert volume.redacted is False
assert case.body_cache.text == "Text 1\nText 2Text 3\nText 4\n"
assert case.body_cache.text == "Text 1\nText 2Text 3not redacted\nText 4\n"
assert html_equal(
case.body_cache.html,
dedent(
Expand All @@ -220,7 +133,7 @@ def test_volume_unredact(reset_sequences, case_factory, monkeypatch, tmp_path):
<h4 class="parties" id="b1-1" data-blocks='[["BL_1.1",0,[25,11,300,490]]]'>Text 1</h4>
</section>
<article class="opinion" data-type="majority">
<p id="b1-2" data-blocks='[["BL_1.2",0,[4,32,100,100]],["BL_1.3",0,[225,11,430,290]]]'>Text 2Text 3</p>
<p id="b1-2" data-blocks='[["BL_1.2",0,[4,32,100,100]],["BL_1.3",0,[225,11,430,290]]]'>Text 2Text 3not redacted</p>
<p class="image" id="b1-3" data-blocks='[["BL_1.4",0,[0,0,100,100]]]'>
<img src="data:image%20data" class="image" width="100" height="100">
</p>
Expand Down
2 changes: 1 addition & 1 deletion capstone/config/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
'scripts.update_snippets',
'scripts.refactor_xml',
'scripts.make_pdf',
'scripts.convert_s3',
'scripts.export_cap_static',
])

# Using a string here means the worker doesn't have to serialize
Expand Down
4 changes: 4 additions & 0 deletions capstone/config/settings/settings_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,10 @@ def immutable_file_test(path, url):
},
}
CELERY_TIMEZONE = 'UTC'
CELERY_TASK_ROUTES = {
"scripts.export_cap_static.export_cases_by_volume": {"queue": "cap_static"},
}


### CAP API settings ###

Expand Down
39 changes: 20 additions & 19 deletions capstone/fabfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import subprocess
import sys
import tempfile
import traceback
from collections import defaultdict
from contextlib import contextmanager
from datetime import datetime
Expand Down Expand Up @@ -61,8 +62,8 @@
validate_private_volumes as validate_private_volumes_script,
export,
update_snippets,
export_cap_static,
)
from scripts import convert_s3
from scripts.helpers import (
copy_file,
volume_barcode_from_folder,
Expand Down Expand Up @@ -298,6 +299,7 @@ def import_web_volumes():
try:
import_volume(f.name)
except IntegrityError:
traceback.print_exc()
print(" - integrity error; volume already imported? skipping")


Expand Down Expand Up @@ -452,31 +454,30 @@ def retry_export_cases(version_string):


@task
def export_cases_to_s3(reporter="528"):
def export_cap_static_cases(dest_dir="/tmp/cap_exports", reporter=None, volume=None, last_run_before=None):
"""
Export a version to S3 of all cases' texts and metadata
by reporter and volume.
First step of the static files export process: export cases, one celery task per volume.
"""
redacted = True
bucket = convert_s3.get_bucket_name(redacted)
convert_s3.export_cases_to_s3(bucket, redacted, reporter)


@task
def export_reporters_to_s3():
"""
Run export of all reporters and their contents to S3.
"""
convert_s3.put_reporters_on_s3(redacted=True)
print("Scheduling tasks to reindex volumes")
volumes = VolumeMetadata.objects.exclude(out_of_scope=True)
if volume:
volumes = volumes.filter(pk=volume)
if reporter:
volumes = volumes.filter(reporter_id=reporter)
tasks.run_task_for_volumes(
export_cap_static.export_cases_by_volume,
volumes,
last_run_before=last_run_before,
dest_dir=dest_dir,
)


@task
def export_reporters_to_s3_trial():
def summarize_cap_static(dest_dir="/tmp/cap_exports"):
"""
Run export of all reporters and their contents to S3
for first API page.
Second step of the static files export process: add summary files at the reporter level and top level.
"""
convert_s3.put_reporters_on_s3_trial(redacted=True)
export_cap_static.finalize_reporters(dest_dir)


@task
Expand Down
Loading

0 comments on commit e2c436e

Please sign in to comment.