Commit: fix #2

Karl Krägelin committed Mar 30, 2022
1 parent 41453ea commit b0c78de

Showing 3 changed files with 61 additions and 50 deletions.
iiimets.py (30 changes: 17 additions & 13 deletions)
@@ -50,19 +50,14 @@ def loadManifestURLsFromPickle(url: str, cwd: Path, http: requests.session, fname
     '''
     Takes either an IIIF Collection URL or a pickled list of URLs
     '''
-    if url is not None:
-        # a URL was passed
-        logger.info(f"Getting Newspaper URLs from {url}")
-        newspaper_urls = getNewspaperManifests(url, http, filter, cwd, logger)
+    if Path(cwd, fname).exists():
+        with open(Path(cwd, fname), 'rb') as f:
+            newspaper_urls = pickle.load(f)
+        logger.info("Loaded urls from pickled file")
     else:
-        # check whether a pickled list is there
-        if Path(cwd, fname).exists():
-            with open(Path(cwd, fname), 'rb') as f:
-                newspaper_urls = pickle.load(f)
-            logger.info("Loaded urls from pickled file")
-        else:
-            logger.error(f"No file {Path(cwd, fname)} found and no IIIF Collection URL given.")
-            newspaper_urls = []
+        logger.error(f"No file {Path(cwd, fname)} found and no IIIF Collection URL given.")
+        newspaper_urls = []

     logger.info(f"{len(newspaper_urls)} Newspaper Issues")
     return newspaper_urls
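After this change the function only handles the pickle case; URL handling moves into start() in the next hunk. For orientation, a minimal sketch of the pickle round trip the branch above appears to expect — the file name below is hypothetical, and the URL is borrowed from misc/manifesturls.txt in this same commit:

import pickle
from pathlib import Path

cwd = Path('.')
fname = 'newspaper_urls.pkl'  # hypothetical name; the tool reads whatever fname points to

urls = ['https://api.digitale-sammlungen.de/iiif/presentation/v2/bsb10932837_00155_u001/manifest']

# Write the list the same way the branch above reads it back.
with open(Path(cwd, fname), 'wb') as f:
    pickle.dump(urls, f)

# Mirrors the 'if Path(cwd, fname).exists()' branch:
with open(Path(cwd, fname), 'rb') as f:
    newspaper_urls = pickle.load(f)
print(f"{len(newspaper_urls)} Newspaper Issues")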
@@ -158,7 +153,16 @@ def start(newspaper_urls: list, cwd: Path, metsfolder: Path, threads: int, cachi

     http = setup_requests()
     date = time.strftime("%Y-%m-%d")
-    newspaper_urls = loadManifestURLsFromPickle(url, cwd, http, file, '##', logger)
+    if file is None and url is None:
+        sys.exit("You need either a URL to an IIIF Collection or the path to a file containing links to IIIF Manifests")
+    elif file is not None:
+        if file.endswith('.txt'):
+            newspaper_urls = [line.rstrip('\n') for line in open(file)]
+        else:
+            newspaper_urls = loadManifestURLsFromPickle(url, cwd, http, file, '##', logger)
+    elif url is not None:
+        logger.info(f"Getting Newspaper URLs from {url}")
+        newspaper_urls = getNewspaperManifests(url, http, filter, cwd, logger)

     if len(newspaper_urls) == 0:
         sys.exit()
misc/manifesturls.txt (3 changes: 3 additions & 0 deletions)
@@ -0,0 +1,3 @@
+https://api.digitale-sammlungen.de/iiif/presentation/v2/bsb10932837_00155_u001/manifest
+https://api.digitale-sammlungen.de/iiif/presentation/v2/bsb10932837_00501_u001/manifest
+https://api.digitale-sammlungen.de/iiif/presentation/v2/bsb10932837_00897_u001/manifest
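The new `.txt` branch in start() consumes files like this one line by line. A minimal sketch of that read, assuming the misc/manifesturls.txt layout above; it uses a context manager instead of the bare open() in the diff so the file handle is closed deterministically:

from pathlib import Path

file = Path('misc', 'manifesturls.txt')

# One IIIF manifest URL per line; strip the trailing newline from each.
with open(file) as fh:
    newspaper_urls = [line.rstrip('\n') for line in fh]

print(len(newspaper_urls))  # 3 for the file above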
src/iiif_harvesting.py (78 changes: 41 additions & 37 deletions)
@@ -20,7 +20,7 @@
 - return list with Manifest URLs
 '''

-def getIdentifier(url, session):
+def getIdentifier(url, session, logger):

     # The list is only used to display progress.
     manifests = []
@@ -34,46 +34,50 @@ def getManifestURLs(response)
             d['name'] = i['label']
             manifests.append(d)
         print(len(manifests))
+    try:
+        response = session.get(url,
+                               verify=False,
+                               timeout=(20, 80))
+    except Exception as e:
+        logger.error(f'The collection URL is not reachable: {e}')
+        sys.exit()
+    else:
+        print(f'total number of Manifests: {response.json()["total"]}')

-    response = session.get(url,
-                           verify=False,
-                           timeout=(20, 80))
-    print(f'total number of Manifests: {response.json()["total"]}')

-    # Now comes the actual loop: it keeps running until we leave it via break.
-    # Each time with a new URL
-    while True:
-        # time.sleep(0.5)
-        # connection attempt incl. error handling
-        try:
-            print(url)
-            response = session.get(url,
-                                   verify=False,
-                                   timeout=(20, 80))
-        except Exception as e:
-            print(f'{url} did not work: {e}')
-            break
-        else:
-            if response.status_code == 404:
-                if input != "":
-                    sys.exit()
-                else:
-                    sys.exit()
+        # Now comes the actual loop: it keeps running until we leave it via break.
+        # Each time with a new URL
+        while True:
+            # time.sleep(0.5)
+            # connection attempt incl. error handling
+            try:
+                print(url)
+                response = session.get(url,
+                                       verify=False,
+                                       timeout=(20, 80))
+            except Exception as e:
+                logger.error(f'The collection URL is not reachable: {e}')
+                break
-            else:
-                getManifestURLs(response)
-                try:
-                    # check whether there is another token or whether we have to leave the loop:
-                    url = response.json()["next"]
-                    # if len(manifests) > 300:
-                    #     break
-                except:
-                    # we are done and leave the while loop via break.
-                    print(f"Identifier Harvesting finished. Got a total of {len(manifests)} IDs.")
-                    break
-    return manifests
+            else:
+                if response.status_code == 404:
+                    if input != "":
+                        sys.exit()
+                    else:
+                        sys.exit()
+                else:
+                    getManifestURLs(response)
+                    try:
+                        # check whether there is another token or whether we have to leave the loop:
+                        url = response.json()["next"]
+                        # if len(manifests) > 300:
+                        #     break
+                    except:
+                        # we are done and leave the while loop via break.
+                        print(f"Identifier Harvesting finished. Got a total of {len(manifests)} IDs.")
+                        break
+        return manifests

 def getNewspaperManifests(url, session, filter: str, cwd, logger):
-    manifests = getIdentifier(url, session)
+    manifests = getIdentifier(url, session, logger)
     df = pd.DataFrame.from_records(manifests)
     df.to_pickle('allmanifests.pkl')
     # newspaper_urls = df.query('name.str.contains("##")', engine="python")['url'].to_list()
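The harvesting loop in getIdentifier pages through an IIIF Presentation 2.x collection by following each response's "next" URL until the last page omits it. A condensed sketch of that pattern under the same assumptions as the diff (the "total", "manifests", "@id", "label" and "next" keys are what the code reads; the collection URL is hypothetical):

import requests

def harvest_manifest_ids(collection_url: str) -> list:
    """Collect manifest URLs by following an IIIF collection's 'next' links."""
    session = requests.Session()
    manifests = []
    url = collection_url
    while url:
        # Same timeouts as the diff: 20 s connect, 80 s read.
        response = session.get(url, timeout=(20, 80))
        response.raise_for_status()
        page = response.json()
        for item in page.get('manifests', []):
            manifests.append({'url': item['@id'], 'name': item['label']})
        # The last page carries no 'next' key, which ends the loop.
        url = page.get('next')
    return manifests

# Hypothetical collection URL, for illustration only:
# ids = harvest_manifest_ids('https://example.org/iiif/collection/newspapers')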
