Skip to content

Commit

Permalink
Always pagination fun. Fixes and test fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
mikejcorey committed Nov 1, 2024
1 parent a818aa7 commit d9e5988
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 10 deletions.
77 changes: 77 additions & 0 deletions apps/deed/fixtures/deed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1033,3 +1033,80 @@
bool_exception: false
matched_terms: []
doc_page_count: null
# Forsyth County, NC pagination
- model: deed.deedpage
pk: 42
fields:
workflow: 1
s3_lookup: 'deedhold/t/0833/08330290.002'
doc_num: ''
# doc_alt_id: '2475325'
# batch_id: '1900 A DEED BL-Z'
book_id: '0833'
page_num: 290
split_page_num: null
doc_date: null
doc_type: 'deedhold'
public_uuid: 'XXXX'
page_image_web: 'web/fake/deedhold/t/0833/08330290.002.jpg'
prev_page_image_web: ''
next_page_image_web: ''
next_next_page_image_web: ''
page_stats: ''
page_ocr_text: ''
page_ocr_json: ''
bool_match: true
bool_exception: false
matched_terms: []
doc_page_count: null
- model: deed.deedpage
pk: 43
fields:
workflow: 1
s3_lookup: 'deedhold/t/0833/08330291.002'
doc_num: ''
# doc_alt_id: '2475325'
# batch_id: '1900 A DEED BL-Z'
book_id: '0833'
page_num: 291
split_page_num: null
doc_date: null
doc_type: 'deedhold'
public_uuid: 'XXXX'
page_image_web: 'web/fake/deedhold/t/0833/08330291.002.jpg'
prev_page_image_web: ''
next_page_image_web: ''
next_next_page_image_web: ''
page_stats: ''
page_ocr_text: ''
page_ocr_json: ''
bool_match: true
bool_exception: false
matched_terms: []
doc_page_count: null
# Same book/page/doc_num but different doc_type than previous
- model: deed.deedpage
pk: 44
fields:
workflow: 1
s3_lookup: 'otherdoctype/t/0833/08330291.002'
doc_num: ''
# doc_alt_id: '2475325'
# batch_id: '1900 A DEED BL-Z'
book_id: '0833'
page_num: 291
split_page_num: null
doc_date: null
doc_type: 'otherdoctype'
public_uuid: 'XXXX'
page_image_web: 'web/fake/otherdoctype/t/0833/08330291.002.jpg'
prev_page_image_web: ''
next_page_image_web: ''
next_next_page_image_web: ''
page_stats: ''
page_ocr_text: ''
page_ocr_json: ''
bool_match: true
bool_exception: false
matched_terms: []
doc_page_count: null
2 changes: 1 addition & 1 deletion apps/deed/management/commands/gather_image_hits.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def build_match_report(self, workflow, matching_keys):
# Non-racial terms, for example as requested by CC County. If this is the only term found, set as exception so it can be exported separately
nonracial_terms = ['disorderly persons', 'less than 18 years', 'no children', 'no minor', 'occupy said real property', 'poverty', 'under the age of', 'years of age or older']

for term in workflow_special_terms:
for term in nonracial_terms:
if term in report_df.columns:
report_df.loc[~report_df[term].isna(), 'nonracial_term_count'] = report_df[term].apply(lambda x: self.split_or_1(x))
else:
Expand Down
9 changes: 5 additions & 4 deletions apps/deed/management/commands/run_term_search_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,11 @@ def trigger_lambda(self, deedpage_obj):
"body": {
"message": "Term search test",
"bucket": settings.AWS_STORAGE_BUCKET_NAME,
"orig": 'Test key',
"json": deedpage_obj['page_ocr_json'],
"txt": deedpage_obj['page_ocr_text'],
"stats": deedpage_obj['page_stats'],
"orig_img": 'Test key',
"ocr_json": deedpage_obj['page_ocr_json'],
# "txt": deedpage_obj['page_ocr_text'],
# "stats": deedpage_obj['page_stats'],
"web_img": deedpage_obj['page_image_web'],
"uuid": deedpage_obj['public_uuid'],
"handwriting_pct": '0'
},
Expand Down
1 change: 1 addition & 0 deletions apps/deed/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def next_thumbnail_preview(self):
def deedpage_offset_finder(self, offset):
kwargs = {
'workflow': self.workflow,
'doc_type': self.doc_type,
'batch_id': self.batch_id,
'book_id': self.book_id,

Expand Down
42 changes: 41 additions & 1 deletion apps/deed/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,4 +468,44 @@ def test_prev_next_wash_split_page(self):

self.assertEqual(deed_page_2.prev_deedpage, deed_page_1)
self.assertEqual(deed_page_2.next_deedpage, None)
self.assertEqual(deed_page_2.next_next_deedpage, None)
self.assertEqual(deed_page_2.next_next_deedpage, None)

# Forsyth County examples
def test_prev_next_forsyth_doc_type(self):
    """Check that prev/next page lookups are scoped by doc_type.

    Uses three records from book 0833: pages 290 and 291 under the
    'deedhold' doc type, plus a page 291 under 'otherdoctype'. The two
    'deedhold' pages should point at each other, and the 'otherdoctype'
    page should not be linked to either of them.

    NOTE: this test needs a name of its own. Defining it as
    test_prev_next_wash_split_page would rebind that name in the class
    body, and the earlier test with that name would silently stop running.
    """
    deed_page_1 = DeedPage.objects.get(
        s3_lookup='deedhold/t/0833/08330290.002'
    )
    deed_page_2 = DeedPage.objects.get(
        s3_lookup='deedhold/t/0833/08330291.002'
    )
    # Same book and page as deed_page_2 but a different doc type, so it
    # should not really match with anything
    deed_page_3 = DeedPage.objects.get(
        s3_lookup='otherdoctype/t/0833/08330291.002'
    )

    # Stored image paths for the neighbouring pages
    self.assertEqual(str(deed_page_1.prev_page_image_web), '')
    self.assertEqual(
        str(deed_page_1.next_page_image_web),
        'web/fake/deedhold/t/0833/08330291.002.jpg'
    )
    self.assertEqual(str(deed_page_1.next_next_page_image_web), '')

    self.assertEqual(
        str(deed_page_2.prev_page_image_web),
        'web/fake/deedhold/t/0833/08330290.002.jpg'
    )
    self.assertEqual(str(deed_page_2.next_page_image_web), '')
    self.assertEqual(str(deed_page_2.next_next_page_image_web), '')

    # Neighbouring DeedPage records
    self.assertIsNone(deed_page_1.prev_deedpage)
    self.assertEqual(deed_page_1.next_deedpage, deed_page_2)
    self.assertIsNone(deed_page_1.next_next_deedpage)

    self.assertEqual(deed_page_2.prev_deedpage, deed_page_1)
    self.assertIsNone(deed_page_2.next_deedpage)
    self.assertIsNone(deed_page_2.next_next_deedpage)

    # Make sure a doc with same book and page but different doctype
    # doesn't get matched
    self.assertIsNone(deed_page_3.prev_deedpage)
    self.assertEqual(deed_page_1.doc_page_count, 1)
    self.assertEqual(deed_page_2.doc_page_count, 1)
    self.assertEqual(deed_page_3.doc_page_count, 1)
17 changes: 14 additions & 3 deletions apps/deed/utils/deed_pagination.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,15 @@ def pagination_merge(match_df, doc_list_df, doc_or_book_selector='doc_num', offs

match_df = match_df.merge(
doc_list_df[[
'doc_type',
doc_or_book_selector,
f'{split_str}page_num_right',
new_image_field,
new_image_lookup_field
]],
how="left",
left_on=[doc_or_book_selector, f"{split_str}page_num_{offset}"],
right_on=[doc_or_book_selector, f"{split_str}page_num_right"]
left_on=["doc_type", doc_or_book_selector, f"{split_str}page_num_{offset}"],
right_on=["doc_type", doc_or_book_selector, f"{split_str}page_num_right"]
).drop(columns=[f"{split_str}page_num_right"]).drop_duplicates(subset=['s3_lookup'])

return match_df
Expand Down Expand Up @@ -82,6 +83,8 @@ def paginate_deedpage_df(df, matches_only=False):
df["doc_type"] = ''

# If doc_num is null, use doc_type/book/page as doc_num
if "doc_num" not in df.columns:
df["doc_num"] = ''
df['doc_num'] = df['doc_num'].str.replace('NONE', '')
df['doc_num'] = df['doc_num'].fillna('')
if 'book_id' in df.columns:
Expand All @@ -93,6 +96,10 @@ def paginate_deedpage_df(df, matches_only=False):
if "book_id" not in df.columns:
df["book_id"] = ''

# Drop duplicates for non-unique doc_num, book_id, page_num, split_page combos
print("Dropping duplicate doc/page/split page combos...")
df = df.drop_duplicates(subset=['doc_type', 'doc_num', 'book_id', 'page_num', 'split_page_num'])

# Tag docs with page count by doc_num
print('Tagging doc num page counts...')
df = tag_doc_num_page_counts(df)
Expand All @@ -107,6 +114,7 @@ def paginate_deedpage_df(df, matches_only=False):
doc_list_df = df[[
# 'pk',
's3_lookup',
'doc_type',
'doc_num',
'book_id',
'page_num',
Expand All @@ -128,7 +136,7 @@ def paginate_deedpage_df(df, matches_only=False):
split_page=True
)

validation_fields = ['s3_lookup', 'doc_num', 'book_id', 'page_num', 'split_page_num', 'doc_page_count']
validation_fields = ['s3_lookup', 'doc_type', 'doc_num', 'book_id', 'page_num', 'split_page_num', 'doc_page_count']
page_to_find = None
# page_to_find = 'OlmstedCountyAbstracts/OldDeedBooks/D-102/HDEED102192'

Expand Down Expand Up @@ -291,6 +299,7 @@ def tag_prev_next_image_sql(workflow, matches_only=False):
).values(
# 'pk',
'bool_match',
'doc_type',
'doc_num',
'public_uuid',
'book_id',
Expand All @@ -316,6 +325,7 @@ def tag_prev_next_image_sql(workflow, matches_only=False):
update_df = pd.DataFrame(objs_to_update).merge(
match_df[[
's3_lookup',
'doc_page_count',
'prev_page_image_web',
'next_page_image_web',
'next_next_page_image_web',
Expand All @@ -333,6 +343,7 @@ def tag_prev_next_image_sql(workflow, matches_only=False):
DeedPage.objects.bulk_update(
dp_objs,
[
'doc_page_count',
'prev_page_image_web',
'next_page_image_web',
'next_next_page_image_web',
Expand Down
2 changes: 1 addition & 1 deletion apps/zoon/fixtures/zoon.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
pk: 1
fields:
zoon_id: 13143
workflow_name: Ramsey County
workflow_name: MN Test County
version: 4.1

- model: zoon.zooniversesubject
Expand Down

0 comments on commit d9e5988

Please sign in to comment.