This repository has been archived by the owner on Nov 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathget_collections.py
78 lines (63 loc) · 2.2 KB
/
get_collections.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
# Description: retrieves the collection of each item
# Example usage:
# python get_collections.py ../data/src/pd_items.json ../data/collections.json ../data/item_collections.json
import codecs
from collections import Counter
import json
from pprint import pprint
import re
import sys
# input
# Three positional paths are required. sys.argv[0] is the script name, so a
# complete invocation has at least four entries; anything shorter would fail
# with IndexError when sys.argv[3] is read below.
if len(sys.argv) < 4:
    print("Usage: %s <inputfile items json> <outputfile collections json> <outputfile item collections json>" % sys.argv[0])
    sys.exit(1)
INPUT_FILE = sys.argv[1]         # items file, read line by line as JSON
OUTPUT_FILE = sys.argv[2]        # destination for the collections list
OUTPUT_ITEMS_FILE = sys.argv[3]  # destination for the per-item collection indices
# init
# collections: one dict per distinct collection uuid, in first-seen order.
# item_collections: for every processed item, the 'index' of its collection.
collections = []
item_collections = []


def addCollection(title, uuid):
    """Register one item as belonging to the collection identified by `uuid`.

    If a collection with this uuid has been seen before, its 'count' is
    incremented. Otherwise a new entry is appended to the module-level
    `collections` list; a falsy `title` yields the label 'Unknown' and an
    empty url. In both cases the collection's 'index' is appended to the
    module-level `item_collections` list.

    Both lists are mutated in place; nothing is returned.
    """
    # Look for a collection already registered under this uuid.
    known = None
    for candidate in collections:
        if candidate['value'] == uuid:
            known = candidate
            break

    if known is None:
        # First item of this collection: create its entry.
        titled = bool(title)
        known = {
            'index': len(collections),
            'value': uuid,
            'label': title if titled else 'Unknown',
            'url': ('http://digitalcollections.nypl.org/collections/' + uuid) if titled else '',
            'count': 1
        }
        collections.append(known)
    else:
        collections[known['index']]['count'] += 1

    # Recorded for every item, whether the collection was new or not.
    item_collections.append(known['index'])
# Read the items file: one JSON document per line, decoded as UTF-8.
with codecs.open(INPUT_FILE, encoding='utf-8') as infile:
    for line in infile:
        # A blank line carries no item; skip it instead of letting
        # json.loads raise on empty input.
        if not line.strip():
            continue

        # Read line as json
        item = json.loads(line)

        # Retrieve collection uuid; a missing, null or empty value becomes ""
        uuid = ""
        if "collectionUuid" in item and item["collectionUuid"]:
            uuid = item["collectionUuid"].strip()

        # Retrieve collection title; a missing, null or empty value becomes "".
        # The title is kept as text, exactly like the uuid: json.dump
        # serializes text directly, whereas byte strings are rejected by
        # Python 3's json module.
        title = ""
        if "collectionTitle" in item and item["collectionTitle"]:
            title = item["collectionTitle"].strip()

        addCollection(title, uuid)
# Report on collections
# Order by item count, largest first. Sorting does not rewrite each entry's
# 'index' field, and item_collections refers to collections by that field,
# so readers of the output must match on 'index', not on list position.
collections = sorted(collections, key=lambda d: d['count'], reverse=True)
pprint(collections)

# Write out data
# print(...) with a single argument produces the same output on Python 2 and
# is also valid on Python 3.
with open(OUTPUT_FILE, 'w') as outfile:
    json.dump(collections, outfile)
print("Wrote " + str(len(collections)) + " collections to " + OUTPUT_FILE)

with open(OUTPUT_ITEMS_FILE, 'w') as outfile:
    json.dump(item_collections, outfile)
print("Wrote " + str(len(item_collections)) + " items to " + OUTPUT_ITEMS_FILE)