From b0c78de9f3bf72d40df63f96661979c55814d829 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?=
Date: Wed, 30 Mar 2022 13:06:56 +0000
Subject: [PATCH] fix https://github.com/karkraeg/iiimets/issues/2

---
 iiimets.py             | 30 +++++++++-------
 misc/manifesturls.txt  |  3 ++
 src/iiif_harvesting.py | 78 ++++++++++++++++++++++--------------------
 3 files changed, 61 insertions(+), 50 deletions(-)
 create mode 100644 misc/manifesturls.txt

diff --git a/iiimets.py b/iiimets.py
index a9e121c..9f9117a 100644
--- a/iiimets.py
+++ b/iiimets.py
@@ -50,19 +50,14 @@ def loadManifestURLsFromPickle(url: str, cwd: Path, http: requests.session, fnam
     '''
     Braucht entweder eine IIIF-Collection URL oder eine Liste mit URLs als Pickle Datei
     '''
-    if url is not None:
-        # URL übergeben
-        logger.info(f"Getting Newspaper URLs from {url}")
-        newspaper_urls = getNewspaperManifests(url, http, filter, cwd, logger)
+
+    if Path(cwd, fname).exists():
+        with open(Path(cwd, fname), 'rb') as f:
+            newspaper_urls = pickle.load(f)
+            logger.info("Loaded urls from pickled file")
     else:
-        # schauen ob gepicklete liste da ist
-        if Path(cwd, fname).exists():
-            with open(Path(cwd, fname), 'rb') as f:
-                newspaper_urls = pickle.load(f)
-                logger.info("Loaded urls from pickled file")
-        else:
-            logger.error(f"Keine Datei {Path(cwd, fname)} gefunden und keine IIIF-Collection URL übergeben.")
-            newspaper_urls = []
+        logger.error(f"Keine Datei {Path(cwd, fname)} gefunden und keine IIIF-Collection URL übergeben.")
+        newspaper_urls = []
 
     logger.info(f"{len(newspaper_urls)} Newspaper Issues")
     return newspaper_urls
@@ -158,7 +153,16 @@ def start(newspaper_urls: list, cwd: Path, metsfolder: Path, threads: int, cachi
     http = setup_requests()
     date = time.strftime("%Y-%m-%d")
 
-    newspaper_urls = loadManifestURLsFromPickle(url, cwd, http, file, '##', logger)
+    if file is None and url is None:
+        sys.exit("You need either a URL to an IIIF Collection or the path to a file containing links to IIIF Manifests")
+    elif file is not None:
+        if file.endswith('.txt'):
+            newspaper_urls = [line.rstrip('\n') for line in open(file)]
+        else:
+            newspaper_urls = loadManifestURLsFromPickle(url, cwd, http, file, '##', logger)
+    elif url is not None:
+        logger.info(f"Getting Newspaper URLs from {url}")
+        newspaper_urls = getNewspaperManifests(url, http, filter, cwd, logger)
 
     if len(newspaper_urls) == 0:
         sys.exit()
diff --git a/misc/manifesturls.txt b/misc/manifesturls.txt
new file mode 100644
index 0000000..3215e70
--- /dev/null
+++ b/misc/manifesturls.txt
@@ -0,0 +1,3 @@
+https://api.digitale-sammlungen.de/iiif/presentation/v2/bsb10932837_00155_u001/manifest
+https://api.digitale-sammlungen.de/iiif/presentation/v2/bsb10932837_00501_u001/manifest
+https://api.digitale-sammlungen.de/iiif/presentation/v2/bsb10932837_00897_u001/manifest
\ No newline at end of file
diff --git a/src/iiif_harvesting.py b/src/iiif_harvesting.py
index 1eb20b7..7b884c3 100644
--- a/src/iiif_harvesting.py
+++ b/src/iiif_harvesting.py
@@ -20,7 +20,7 @@
 - return list with Manifest URLs
 '''
 
-def getIdentifier(url, session):
+def getIdentifier(url, session, logger):
     # Die Liste nutzen wir nur zum Anzeigen des Fortschritts.
manifests = [] @@ -34,46 +34,50 @@ def getManifestURLs(response): d['name'] = i['label'] manifests.append(d) print(len(manifests)) + try: + response = session.get(url, + verify=False, + timeout=(20, 80)) + except Exception as e: + logger.error(f'The collection URL is not reachable: {e}') + sys.exit() + else: + print(f'total number of Manifests: {response.json()["total"]}') - response = session.get(url, - verify=False, - timeout=(20, 80)) - print(f'total number of Manifests: {response.json()["total"]}') - - # Jetzt kommt die eigentliche Schleife: Solange wir da nicht per break rausgehen, läuft die. - # Immer mit neuer URL - while True: - # time.sleep(0.5) - # Verbindungsversuch inkl. Errorhandling - try: - print(url) - response = session.get(url, - verify=False, - timeout=(20, 80)) - except Exception as e: - print(f'{url} hat nicht geklappt: {e}') - break - else: - if response.status_code == 404: - if input != "": - sys.exit() - else: - sys.exit() + # Jetzt kommt die eigentliche Schleife: Solange wir da nicht per break rausgehen, läuft die. + # Immer mit neuer URL + while True: + # time.sleep(0.5) + # Verbindungsversuch inkl. Errorhandling + try: + print(url) + response = session.get(url, + verify=False, + timeout=(20, 80)) + except Exception as e: + logger.error(f'The collection URL is not reachable: {e}') + break else: - getManifestURLs(response) - try: - # schauen, ob es noch einen Token gibt oder ob wir aus der Schleife rausmüssen: - url = response.json()["next"] - # if len(manifests) > 300: - # break - except: - # wir sind fertig und gehen per break aus dem while-Loop raus. - print(f"Identifier Harvesting beendet. Insgesamt {len(manifests)} IDs bekommen.") - break - return manifests + if response.status_code == 404: + if input != "": + sys.exit() + else: + sys.exit() + else: + getManifestURLs(response) + try: + # schauen, ob es noch einen Token gibt oder ob wir aus der Schleife rausmüssen: + url = response.json()["next"] + # if len(manifests) > 300: + # break + except: + # wir sind fertig und gehen per break aus dem while-Loop raus. + print(f"Identifier Harvesting beendet. Insgesamt {len(manifests)} IDs bekommen.") + break + return manifests def getNewspaperManifests(url, session, filter: str, cwd, logger): - manifests = getIdentifier(url, session) + manifests = getIdentifier(url, session, logger) df = pd.DataFrame.from_records(manifests) df.to_pickle('allmanifests.pkl') # newspaper_urls = df.query('name.str.contains("##")', engine="python")['url'].to_list()
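
Note (not part of the patch): a minimal sketch of the plain-text input path this patch adds to start() in iiimets.py, assuming one IIIF Manifest URL per line as in misc/manifesturls.txt. The helper name load_manifest_urls_from_txt and the blank-line filtering are illustrative assumptions, not code from this repository.

# Hedged sketch of the `.txt` branch added in iiimets.py: read one
# IIIF Manifest URL per line from a plain-text file.
def load_manifest_urls_from_txt(path: str) -> list:
    # Context manager closes the file handle; blank lines are skipped (assumption for illustration).
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

if __name__ == '__main__':
    urls = load_manifest_urls_from_txt('misc/manifesturls.txt')
    # Mirrors the "{len(newspaper_urls)} Newspaper Issues" log message in loadManifestURLsFromPickle.
    print(f'{len(urls)} Newspaper Issues')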