From 12b0712676bbe584b3ec65235916cf1a8cf68a13 Mon Sep 17 00:00:00 2001 From: kyle Date: Thu, 21 Nov 2024 13:43:32 -0500 Subject: [PATCH 1/3] SFR-2349: Increasing CSV Field Size Limit --- processes/ingest/hathi_trust.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/processes/ingest/hathi_trust.py b/processes/ingest/hathi_trust.py index 9787297f1d..eb960ed9f5 100644 --- a/processes/ingest/hathi_trust.py +++ b/processes/ingest/hathi_trust.py @@ -5,6 +5,7 @@ import os import requests from requests.exceptions import ReadTimeout, HTTPError +import sys from constants.get_constants import get_constants from ..core import CoreProcess @@ -106,6 +107,8 @@ def importFromHathiFile(self, hathi_url, start_date_time=None): self.readHathiFile(hathi_tsv, start_date_time) def readHathiFile(self, hathi_tsv, start_date_time=None): + csv.field_size_limit(sys.maxsize) + for number_of_books_ingested, book in enumerate(hathi_tsv): if self.ingest_limit and number_of_books_ingested > self.ingest_limit: break @@ -114,7 +117,10 @@ def readHathiFile(self, hathi_tsv, start_date_time=None): book_date_updated = (len(book) > 14 and book[14]) or None if book_date_updated: - hathi_date_modified = datetime.strptime(book_date_updated, '%Y-%m-%d %H:%M:%S').replace(tzinfo=None) + try: + hathi_date_modified = datetime.strptime(book_date_updated, '%Y-%m-%d %H:%M:%S').replace(tzinfo=None) + except Exception: + hathi_date_modified = None if book_right and book_right not in self.HATHI_RIGHTS_SKIPS: if not start_date_time or hathi_date_modified >= start_date_time: From 0d4deee042ffbe9e7945e2c3c8f17fc70c71880a Mon Sep 17 00:00:00 2001 From: kyle Date: Thu, 21 Nov 2024 13:50:05 -0500 Subject: [PATCH 2/3] updating field size limit --- processes/ingest/hathi_trust.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/processes/ingest/hathi_trust.py b/processes/ingest/hathi_trust.py index eb960ed9f5..54fcdbbe16 100644 --- a/processes/ingest/hathi_trust.py +++ b/processes/ingest/hathi_trust.py @@ -16,6 +16,7 @@ class HathiTrustProcess(CoreProcess): HATHI_RIGHTS_SKIPS = ['ic', 'icus', 'ic-world', 'und'] + FIELD_SIZE_LIMIT = 131072 * 2 # 131072 is the default size limit def __init__(self, *args): super(HathiTrustProcess, self).__init__(*args[:4], batchSize=1000) @@ -107,7 +108,7 @@ def importFromHathiFile(self, hathi_url, start_date_time=None): self.readHathiFile(hathi_tsv, start_date_time) def readHathiFile(self, hathi_tsv, start_date_time=None): - csv.field_size_limit(sys.maxsize) + csv.field_size_limit(self.FIELD_SIZE_LIMIT) for number_of_books_ingested, book in enumerate(hathi_tsv): if self.ingest_limit and number_of_books_ingested > self.ingest_limit: From 09fdf6c0355860ac0822ad8638654a7a202083a2 Mon Sep 17 00:00:00 2001 From: kyle Date: Thu, 21 Nov 2024 13:50:22 -0500 Subject: [PATCH 3/3] removing import --- processes/ingest/hathi_trust.py | 1 - 1 file changed, 1 deletion(-) diff --git a/processes/ingest/hathi_trust.py b/processes/ingest/hathi_trust.py index 54fcdbbe16..9bc6446538 100644 --- a/processes/ingest/hathi_trust.py +++ b/processes/ingest/hathi_trust.py @@ -5,7 +5,6 @@ import os import requests from requests.exceptions import ReadTimeout, HTTPError -import sys from constants.get_constants import get_constants from ..core import CoreProcess