From d9e5988d2100efdd86bf24a132401c6996532805 Mon Sep 17 00:00:00 2001 From: Michael Corey Date: Fri, 1 Nov 2024 09:58:39 -0500 Subject: [PATCH] Always pagination fun. Fixes and test fixes --- apps/deed/fixtures/deed.yaml | 77 +++++++++++++++++++ .../management/commands/gather_image_hits.py | 2 +- .../commands/run_term_search_test.py | 9 ++- apps/deed/models.py | 1 + apps/deed/tests.py | 42 +++++++++- apps/deed/utils/deed_pagination.py | 17 +++- apps/zoon/fixtures/zoon.yaml | 2 +- 7 files changed, 140 insertions(+), 10 deletions(-) diff --git a/apps/deed/fixtures/deed.yaml b/apps/deed/fixtures/deed.yaml index 6e2b9d3..48e4d9b 100644 --- a/apps/deed/fixtures/deed.yaml +++ b/apps/deed/fixtures/deed.yaml @@ -1033,3 +1033,80 @@ bool_exception: false matched_terms: [] doc_page_count: null +# Forsyth County, NC pagination +- model: deed.deedpage + pk: 42 + fields: + workflow: 1 + s3_lookup: 'deedhold/t/0833/08330290.002' + doc_num: '' + # doc_alt_id: '2475325' + # batch_id: '1900 A DEED BL-Z' + book_id: '0833' + page_num: 290 + split_page_num: null + doc_date: null + doc_type: 'deedhold' + public_uuid: 'XXXX' + page_image_web: 'web/fake/deedhold/t/0833/08330290.002.jpg' + prev_page_image_web: '' + next_page_image_web: '' + next_next_page_image_web: '' + page_stats: '' + page_ocr_text: '' + page_ocr_json: '' + bool_match: true + bool_exception: false + matched_terms: [] + doc_page_count: null +- model: deed.deedpage + pk: 43 + fields: + workflow: 1 + s3_lookup: 'deedhold/t/0833/08330291.002' + doc_num: '' + # doc_alt_id: '2475325' + # batch_id: '1900 A DEED BL-Z' + book_id: '0833' + page_num: 291 + split_page_num: null + doc_date: null + doc_type: 'deedhold' + public_uuid: 'XXXX' + page_image_web: 'web/fake/deedhold/t/0833/08330291.002.jpg' + prev_page_image_web: '' + next_page_image_web: '' + next_next_page_image_web: '' + page_stats: '' + page_ocr_text: '' + page_ocr_json: '' + bool_match: true + bool_exception: false + matched_terms: [] + doc_page_count: null +# Same 
book/page/doc_num but different doc_type than previous +- model: deed.deedpage + pk: 44 + fields: + workflow: 1 + s3_lookup: 'otherdoctype/t/0833/08330291.002' + doc_num: '' + # doc_alt_id: '2475325' + # batch_id: '1900 A DEED BL-Z' + book_id: '0833' + page_num: 291 + split_page_num: null + doc_date: null + doc_type: 'otherdoctype' + public_uuid: 'XXXX' + page_image_web: 'web/fake/otherdoctype/t/0833/08330291.002.jpg' + prev_page_image_web: '' + next_page_image_web: '' + next_next_page_image_web: '' + page_stats: '' + page_ocr_text: '' + page_ocr_json: '' + bool_match: true + bool_exception: false + matched_terms: [] + doc_page_count: null \ No newline at end of file diff --git a/apps/deed/management/commands/gather_image_hits.py b/apps/deed/management/commands/gather_image_hits.py index b868181..3edf578 100644 --- a/apps/deed/management/commands/gather_image_hits.py +++ b/apps/deed/management/commands/gather_image_hits.py @@ -92,7 +92,7 @@ def build_match_report(self, workflow, matching_keys): # non-racial terms, for example as requested by CC County. 
If this is only term found, set as exception so it can be exported separately nonracial_terms = ['disorderly persons', 'less than 18 years', 'no children', 'no minor', 'occupy said real property', 'poverty', 'under the age of', 'years of age or older'] - for term in workflow_special_terms: + for term in nonracial_terms: if term in report_df.columns: report_df.loc[~report_df[term].isna(), 'nonracial_term_count'] = report_df[term].apply(lambda x: self.split_or_1(x)) else: diff --git a/apps/deed/management/commands/run_term_search_test.py b/apps/deed/management/commands/run_term_search_test.py index 3de4ba9..b00f228 100644 --- a/apps/deed/management/commands/run_term_search_test.py +++ b/apps/deed/management/commands/run_term_search_test.py @@ -126,10 +126,11 @@ def trigger_lambda(self, deedpage_obj): "body": { "message": "Term search test", "bucket": settings.AWS_STORAGE_BUCKET_NAME, - "orig": 'Test key', - "json": deedpage_obj['page_ocr_json'], - "txt": deedpage_obj['page_ocr_text'], - "stats": deedpage_obj['page_stats'], + "orig_img": 'Test key', + "ocr_json": deedpage_obj['page_ocr_json'], + # "txt": deedpage_obj['page_ocr_text'], + # "stats": deedpage_obj['page_stats'], + "web_img": deedpage_obj['page_image_web'], "uuid": deedpage_obj['public_uuid'], "handwriting_pct": '0' }, diff --git a/apps/deed/models.py b/apps/deed/models.py index 0ca32ed..35f0f52 100644 --- a/apps/deed/models.py +++ b/apps/deed/models.py @@ -145,6 +145,7 @@ def next_thumbnail_preview(self): def deedpage_offset_finder(self, offset): kwargs = { 'workflow': self.workflow, + 'doc_type': self.doc_type, 'batch_id': self.batch_id, 'book_id': self.book_id, diff --git a/apps/deed/tests.py b/apps/deed/tests.py index b138983..011c11f 100644 --- a/apps/deed/tests.py +++ b/apps/deed/tests.py @@ -468,4 +468,44 @@ def test_prev_next_wash_split_page(self): self.assertEqual(deed_page_2.prev_deedpage, deed_page_1) self.assertEqual(deed_page_2.next_deedpage, None) - 
self.assertEqual(deed_page_2.next_next_deedpage, None) \ No newline at end of file + self.assertEqual(deed_page_2.next_next_deedpage, None) + + # Forsyth County examples: same book/page can exist under more than one doc_type + def test_prev_next_forsyth_doc_type(self): + """Are prev/next images and deedpage records only paired within the same doc_type? + """ + + deed_page_1 = DeedPage.objects.get( + s3_lookup='deedhold/t/0833/08330290.002' + ) + + deed_page_2 = DeedPage.objects.get( + s3_lookup='deedhold/t/0833/08330291.002' + ) + + # Same book/page as deed_page_2 but a different doc_type, so it should not pair with the deedhold pages + deed_page_3 = DeedPage.objects.get( + s3_lookup='otherdoctype/t/0833/08330291.002' + ) + + self.assertEqual(deed_page_1.prev_page_image_web.__str__(), '') + self.assertEqual(deed_page_1.next_page_image_web.__str__(), 'web/fake/deedhold/t/0833/08330291.002.jpg') + self.assertEqual(deed_page_1.next_next_page_image_web.__str__(), '') + + self.assertEqual(deed_page_2.prev_page_image_web.__str__(), 'web/fake/deedhold/t/0833/08330290.002.jpg') + self.assertEqual(deed_page_2.next_page_image_web.__str__(), '') + self.assertEqual(deed_page_2.next_next_page_image_web.__str__(), '') + + self.assertEqual(deed_page_1.prev_deedpage, None) + self.assertEqual(deed_page_1.next_deedpage, deed_page_2) + self.assertEqual(deed_page_1.next_next_deedpage, None) + + self.assertEqual(deed_page_2.prev_deedpage, deed_page_1) + self.assertEqual(deed_page_2.next_deedpage, None) + self.assertEqual(deed_page_2.next_next_deedpage, None) + + # Make sure a doc with same book and page but different doctype doesn't get matched + self.assertEqual(deed_page_3.prev_deedpage, None) + self.assertEqual(deed_page_1.doc_page_count, 1) + self.assertEqual(deed_page_2.doc_page_count, 1) + self.assertEqual(deed_page_3.doc_page_count, 1) diff --git a/apps/deed/utils/deed_pagination.py b/apps/deed/utils/deed_pagination.py index 02cdc5f..b8a9b58 100644 --- a/apps/deed/utils/deed_pagination.py +++ b/apps/deed/utils/deed_pagination.py @@ -45,14 +45,15 @@ def pagination_merge(match_df, doc_list_df, 
doc_or_book_selector='doc_num', offs match_df = match_df.merge( doc_list_df[[ + 'doc_type', doc_or_book_selector, f'{split_str}page_num_right', new_image_field, new_image_lookup_field ]], how="left", - left_on=[doc_or_book_selector, f"{split_str}page_num_{offset}"], - right_on=[doc_or_book_selector, f"{split_str}page_num_right"] + left_on=["doc_type", doc_or_book_selector, f"{split_str}page_num_{offset}"], + right_on=["doc_type", doc_or_book_selector, f"{split_str}page_num_right"] ).drop(columns=[f"{split_str}page_num_right"]).drop_duplicates(subset=['s3_lookup']) return match_df @@ -82,6 +83,8 @@ def paginate_deedpage_df(df, matches_only=False): df["doc_type"] = '' # If doc_num is null, use doc_type/book/page as doc_num + if "doc_num" not in df.columns: + df["doc_num"] = '' df['doc_num'] = df['doc_num'].str.replace('NONE', '') df['doc_num'] = df['doc_num'].fillna('') if 'book_id' in df.columns: @@ -93,6 +96,10 @@ def paginate_deedpage_df(df, matches_only=False): if "book_id" not in df.columns: df["book_id"] = '' + # Drop duplicates for non-unique doc_num, book_id, page_num, split_page combos + print("Dropping duplicate doc/page/split page combos...") + df = df.drop_duplicates(subset=['doc_type', 'doc_num', 'book_id', 'page_num', 'split_page_num']) + # Tag docs with page count by doc_num print('Tagging doc num page counts...') df = tag_doc_num_page_counts(df) @@ -107,6 +114,7 @@ def paginate_deedpage_df(df, matches_only=False): doc_list_df = df[[ # 'pk', 's3_lookup', + 'doc_type', 'doc_num', 'book_id', 'page_num', @@ -128,7 +136,7 @@ def paginate_deedpage_df(df, matches_only=False): split_page=True ) - validation_fields = ['s3_lookup', 'doc_num', 'book_id', 'page_num', 'split_page_num', 'doc_page_count'] + validation_fields = ['s3_lookup', 'doc_type', 'doc_num', 'book_id', 'page_num', 'split_page_num', 'doc_page_count'] page_to_find = None # page_to_find = 'OlmstedCountyAbstracts/OldDeedBooks/D-102/HDEED102192' @@ -291,6 +299,7 @@ def 
tag_prev_next_image_sql(workflow, matches_only=False): ).values( # 'pk', 'bool_match', + 'doc_type', 'doc_num', 'public_uuid', 'book_id', @@ -316,6 +325,7 @@ def tag_prev_next_image_sql(workflow, matches_only=False): update_df = pd.DataFrame(objs_to_update).merge( match_df[[ 's3_lookup', + 'doc_page_count', 'prev_page_image_web', 'next_page_image_web', 'next_next_page_image_web', @@ -333,6 +343,7 @@ def tag_prev_next_image_sql(workflow, matches_only=False): DeedPage.objects.bulk_update( dp_objs, [ + 'doc_page_count', 'prev_page_image_web', 'next_page_image_web', 'next_next_page_image_web', diff --git a/apps/zoon/fixtures/zoon.yaml b/apps/zoon/fixtures/zoon.yaml index 57afff8..f90a4e2 100644 --- a/apps/zoon/fixtures/zoon.yaml +++ b/apps/zoon/fixtures/zoon.yaml @@ -2,7 +2,7 @@ pk: 1 fields: zoon_id: 13143 - workflow_name: Ramsey County + workflow_name: MN Test County version: 4.1 - model: zoon.zooniversesubject