SFR-2308_PubBacklistMapping (#460)

* SFR-2308_PubBacklistMapping * Fixed method name * Fixed failed test * Fixed failed test * Added source_id to createMapping and retrieved source name from Airtable * Changed some variables to snake case --------- Co-authored-by: Dmitri <[email protected]>
NYPL · Nov 26, 2024 · 6efb5b6 · 6efb5b6
1 parent f97ca6f
commit 6efb5b6
Show file tree

Hide file tree

Showing 5 changed files with 152 additions and 19 deletions.
diff --git a/mappings/UofM.py b/mappings/UofM.py
@@ -83,12 +83,6 @@ def formatSubjects(self):
             return subjectList
 
     def formatRights(self):
-        '''
-        The pipe delimiter is to separate the Rights table attributes into this format:
-        source|license|reason|statement|date 
-        which makes it easy to place the right data into the columns when clustered
-        '''
-
         if not self.record.rights: 
             return None
 
@@ -102,9 +96,3 @@ def formatRights(self):
             return 'UofM|{}||{}|'.format('public_domain', 'Public Domain') 
 
         return None
-
-
-
-
-
-
diff --git a/mappings/publisher_backlist.py b/mappings/publisher_backlist.py
@@ -0,0 +1,96 @@
+from .json import JSONMapping
+
+class PublisherBacklistMapping(JSONMapping):
+    def __init__(self, source):
+        """Set up the mapping for one publisher backlist record.
+
+        `source` is handed to the parent initializer together with an empty
+        dict; the field mapping itself is then built by createMapping() and
+        stored on `self.mapping`.
+        """
+        super().__init__(source, {})
+        self.mapping = self.createMapping()
+
+    def createMapping(self):
+        """Return the field mapping stored on `self.mapping`.
+
+        Each key is a record attribute name. Each value pairs a source field
+        name with a format template (or is a list of such pairs).
+        NOTE(review): `{0}` is presumably replaced with the source field value
+        by JSONMapping, which is not shown here -- confirm.
+        """
+        return {
+            'title': ('Title', '{0}'),
+            'authors': ('Author(s)', '{0}'),
+            'dates': [('Pub Date', '{0}|publication_date')],
+            'publisher': [('Publisher (from Projects)', '{0}||')],
+            'identifiers': [
+                ('ISBN', '{0}|isbn'),
+                ('OCLC', '{0}|oclc')
+            ],
+            'rights': ('DRB Rights Classification', '{0}||||'),
+            'contributors': [('Contributors', '{0}|||contributor')],
+            'subjects': ('Subject 1', '{0}'),
+            'source': ('Project Name (from Projects)', '{0}'),
+            'source_id': ('DRB Record_ID', '{0}'),
+            # Same source field as 'publisher', kept without the '||' suffix.
+            'publisher_project_source': ('Publisher (from Projects)', '{0}')
+        }
+
+    def applyFormatting(self):
+        """Normalize the mapped fields of `self.record` in place.
+
+        Resets `has_part` to an empty list, reduces `source` and
+        `publisher_project_source` to a single string taken from their first
+        entry, runs the author / subject / identifier formatters on non-empty
+        fields, and always recomputes `rights`.
+        """
+        record = self.record
+        record.has_part = []
+
+        # Keep only the first space-delimited token of the first source entry.
+        if record.source:
+            record.source = record.source[0].split(' ')[0]
+
+        # Unwrap the first entry of the publisher project source.
+        if record.publisher_project_source:
+            record.publisher_project_source = record.publisher_project_source[0]
+
+        if record.authors:
+            record.authors = self.format_authors()
+
+        if record.subjects:
+            record.subjects = self.format_subjects()
+
+        if record.identifiers:
+            record.identifiers = self.format_identifiers()
+
+        # Rights are recomputed unconditionally; format_rights handles empties.
+        record.rights = self.format_rights()
+
+    def format_authors(self):
+        """Return `self.record.authors` as a list of 'name|||true' strings.
+
+        A string containing ';' is split into one entry per author, with
+        surrounding whitespace trimmed and empty pieces dropped. Any other
+        value becomes a single entry.
+        """
+        authors = self.record.authors
+
+        # Split on ';' and strip, so both 'A; B' and 'A;B' yield two authors.
+        if isinstance(authors, str) and ';' in authors:
+            names = (name.strip() for name in authors.split(';'))
+            return [f'{name}|||true' for name in names if name]
+
+        # Single author: same 'name|||true' shape as the multi-author branch.
+        return [f'{authors}|||true']
+
+
+    def format_identifiers(self):
+        """Expand a multi-value ISBN entry into one 'value|isbn' entry each.
+
+        Only the first identifier is inspected. If it is an ISBN entry whose
+        value contains ';', the value is split into separate ISBN entries and
+        the second identifier is appended when it is an OCLC entry. In every
+        other case the identifiers are returned unchanged.
+        The caller only invokes this when the identifier list is non-empty.
+        """
+        identifiers = self.record.identifiers
+        first = identifiers[0]
+
+        if 'isbn' not in first:
+            return identifiers
+
+        isbn_string = first.split('|')[0]
+        if ';' not in isbn_string:
+            return identifiers
+
+        # Split on ';' and strip, so '1; 2' and '1;2' both split cleanly and
+        # a trailing separator does not produce an empty '|isbn' entry.
+        isbns = (isbn.strip() for isbn in isbn_string.split(';'))
+        formatted_isbns = [f'{isbn}|isbn' for isbn in isbns if isbn]
+
+        if len(identifiers) > 1 and 'oclc' in identifiers[1]:
+            formatted_isbns.append(f'{identifiers[1]}')
+
+        return formatted_isbns
+
+    def format_subjects(self):
+        """Return `self.record.subjects` as a list of 'subject||' strings.
+
+        A value containing '|' is split on that delimiter into one entry per
+        subject; otherwise the whole value becomes a single entry.
+        """
+        subjects = self.record.subjects
+        parts = subjects.split('|') if '|' in subjects else [subjects]
+        return [f'{part}||' for part in parts]
+
+    def format_rights(self):
+        """Build the pipe-delimited rights string for the record.
+
+        The first '|'-separated element of `self.record.rights` is the rights
+        status. 'in copyright' and 'public domain' produce
+        '<source>|<license>||<statement>|', where <source> is the value of
+        `self.record.source`. Returns None when rights is empty or the status
+        is not recognized.
+        """
+        if not self.record.rights:
+            return None
+
+        rights_status = self.record.rights.split('|')[0]
+        # Interpolate the record's source value, not the literal text
+        # 'self.record.source'.
+        source = self.record.source
+
+        if rights_status == 'in copyright':
+            return '{}|{}||{}|'.format(source, 'in_copyright', 'In Copyright')
+
+        if rights_status == 'public domain':
+            return '{}|{}||{}|'.format(source, 'public_domain', 'Public Domain')
+
+        return None
diff --git a/processes/ingest/publisher_backlist.py b/processes/ingest/publisher_backlist.py
@@ -33,7 +33,6 @@ def runProcess(self):
             else: 
                 logger.warning(f'Unknown Publisher Backlist ingestion process type {self.process}')
                 return
-
             for record in records:
                 self.addDCDWToUpdateList(record)
 

diff --git a/services/sources/publisher_backlist_service.py b/services/sources/publisher_backlist_service.py
@@ -3,10 +3,10 @@
 import requests
 import json
 import urllib.parse
-from typing import Optional, Dict
+from typing import Optional
 
 from logger import create_log
-from mappings.UofM import UofMMapping
+from mappings.publisher_backlist import PublisherBacklistMapping
 from .source_service import SourceService
 
 logger = create_log(__name__)
@@ -23,14 +23,14 @@ def get_records(
         start_timestamp: datetime=None,
         offset: Optional[int]=None,
         limit: Optional[int]=None
-    ) -> list[UofMMapping]:
+    ) -> list[PublisherBacklistMapping]:
         array_json_records = self.get_records_json(full_import, start_timestamp, offset, limit)
 
         for json_dict in array_json_records:
             for records_value in json_dict['records']:
                 try:
-                    record_metadata_dict = records_value
-                    record = UofMMapping(record_metadata_dict)
+                    record_metadata_dict = records_value['fields']
+                    record = PublisherBacklistMapping(record_metadata_dict)
                     record.applyMapping()
                 except Exception:
                     logger.exception(f'Failed to process Publisher Backlist record')
@@ -89,7 +89,6 @@ def get_records_array(self,
         pub_backlist_records_response = requests.get(url, headers=headers)
         pub_backlist_records_response_json = pub_backlist_records_response.json()
         array_json = [pub_backlist_records_response_json]
-
         while 'offset' in pub_backlist_records_response_json:
             next_page_url = url + f"&offset={pub_backlist_records_response_json['offset']}"
             pub_backlist_records_response = requests.get(next_page_url, headers=headers)

diff --git a/tests/unit/test_pub_backlist_mapping.py b/tests/unit/test_pub_backlist_mapping.py
@@ -0,0 +1,51 @@
+import pytest
+
+from mappings.publisher_backlist import PublisherBacklistMapping
+
+class TestPublisherBacklistMapping:
+    """Unit tests for PublisherBacklistMapping's mapping and formatting."""
+
+    @pytest.fixture
+    def test_mapping(self):
+        # Subclass with a no-op style __init__ so no source record is needed.
+        class TestPublisherBacklistMapping(PublisherBacklistMapping):
+            def __init__(self):
+                self.mapping = None
+
+        return TestPublisherBacklistMapping()
+
+    @pytest.fixture
+    def testRecordStandard(self, mocker):
+        # Attribute names mirror the keys returned by createMapping();
+        # 'contributors' is plural there, so it must be plural here too.
+        return mocker.MagicMock(
+            title='testTitle',
+            authors=['testAuthor|||true'],
+            dates=['testDate|publication_date'],
+            publisher=['testPublisher||'],
+            identifiers=['testISBN|isbn', 'testOCLC|oclc'],
+            rights='in copyright||||',
+            contributors=['testContributor|||contributor'],
+            subjects='testSubject',
+            source=['UofMichigan Backlist'],
+            source_id='testSourceID',
+            publisher_project_source=['University of Michigan Press']
+        )
+
+    def test_createMapping(self, test_mapping):
+        """createMapping exposes the expected keys in the expected order."""
+        record_mapping = test_mapping.createMapping()
+
+        assert list(record_mapping.keys()) == [
+            'title', 'authors', 'dates', 'publisher',
+            'identifiers', 'rights', 'contributors', 'subjects',
+            'source', 'source_id', 'publisher_project_source'
+        ]
+        assert record_mapping['title'] == ('Title', '{0}')
+
+    def test_applyFormatting_standard(self, test_mapping, testRecordStandard):
+        """applyFormatting normalizes source, publisher source and subjects."""
+        test_mapping.record = testRecordStandard
+
+        test_mapping.applyFormatting()
+
+        assert test_mapping.record.has_part == []
+        assert test_mapping.record.source == 'UofMichigan'
+        assert test_mapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc']
+        assert test_mapping.record.source_id == 'testSourceID'
+        assert test_mapping.record.publisher == ['testPublisher||']
+        assert test_mapping.record.publisher_project_source == 'University of Michigan Press'
+        assert test_mapping.record.subjects == ['testSubject||']
-Original file line number
+Diff line change
@@ Expand Up / @@ -33,7 +33,6 @@ def runProcess(self): @@
                 else:
                     logger.warning(f'Unknown Publisher Backlist ingestion process type {self.process}')
                     return
                 for record in records:
                     self.addDCDWToUpdateList(record)
@@ Expand Down @@