-
Notifications
You must be signed in to change notification settings - Fork 0
/
goteborgs_gatunamn_get_all_object_attribs.py
executable file
·128 lines (90 loc) · 3.48 KB
/
goteborgs_gatunamn_get_all_object_attribs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
import requests
import xml.etree.ElementTree as ET
import re
from bs4 import BeautifulSoup
import json
import pprint
import urllib.parse
import fileinput
import sys
def get_dl(soup):
    """Collect the <dt>/<dd> texts of every <dl> in *soup* into one dict.

    The soup is modified in place first: every ``<span class="hidden">`` and
    every ``<dl class="object_sub_description">`` is removed so that their
    text does not end up in the result.

    Terms and descriptions are gathered as two separate lists and paired by
    position, so the pairing is only meaningful when each <dt> has exactly
    one <dd>.  Surplus items on either side are dropped by ``zip``, and when
    a term occurs more than once the last description wins.

    Args:
        soup: Parsed document (a BeautifulSoup object or tag).

    Returns:
        dict mapping the stripped <dt> text to the stripped <dd> text.
    """
    for tag in soup.find_all("span", class_="hidden"):
        tag.decompose()
    for tag in soup.find_all("dl", class_="object_sub_description"):
        tag.decompose()

    keys, values = [], []
    # find_all() is the supported spelling; findAll() is a deprecated alias.
    for dl in soup.find_all("dl"):
        # Line breaks separate the text fragments of nested tags.
        keys.extend(dt.get_text("\n").strip() for dt in dl.find_all("dt"))
        values.extend(dd.get_text("\n\n").strip() for dd in dl.find_all("dd"))
    return dict(zip(keys, values))
carlotta_base_url = "https://samlingar.goteborgsstadsmuseum.se"
carlotta_object_path = "/carlotta/web/object"
carlotta_object_url = carlotta_base_url + carlotta_object_path

# Seconds to wait for the server; without a timeout a stalled connection
# would block the whole run indefinitely.
_REQUEST_TIMEOUT = 30


def _strip_ignored_markup(soup):
    """Remove, in place, the parts of the page that must not be counted."""
    for tag in soup.find_all("span", class_="hidden"):
        tag.decompose()
    for tag in soup.find_all("dl", class_="object_sub_description"):
        tag.decompose()
    for tag in soup.find_all("dt", class_="object_description"):
        parent = tag.parent
        # Guard against a parent that is no longer available (e.g. it was
        # already removed on behalf of a sibling <dt>).
        if parent is not None:
            parent.decompose()


def _count_attribs(soup, attribs):
    """Tally each linked <dt> found inside a <dl> of *soup* into *attribs*.

    The key is the last path segment of the link's href (anything from the
    first ';' on is cut off).  Each entry holds the link text of the first
    occurrence under 'description' and the number of occurrences under
    'count'.
    """
    for dl in soup.find_all("dl"):
        for dt in dl.find_all("dt"):
            if dt.get_text() == "\n":
                continue
            link = dt.a
            href = link.get("href") if link is not None else None
            if not href:
                # No link to derive an id from, so nothing to count.
                continue
            obj_id = href.split(';')[0].split('/')[-1]
            entry = attribs.setdefault(
                obj_id, {'description': link.get_text(), 'count': 0})
            entry['count'] += 1


# Object ids are read one per line from the files named on the command line,
# or from stdin when none are given; blank lines are ignored.
# For a quick manual run a literal list works as well, e.g.
# objects = [1375460, 2111523, 1377563, 1381895, 1379146]
# objects = [1375462, 1375464, 1376428]
with fileinput.input() as lines:
    objects = [line.strip() for line in lines if line.strip()]

attribs = {}

for object_id in objects:
    # Progress goes to stderr (before the request, so a failure can be
    # attributed to an id), keeping stdout clean for the JSON result.
    print("object:", object_id, file=sys.stderr)
    resp = requests.get(carlotta_object_url + "/" + str(object_id),
                        timeout=_REQUEST_TIMEOUT)
    if not resp.ok:
        # An error page is not an object page; do not count its content.
        print("skipping object:", object_id, "HTTP status", resp.status_code,
              file=sys.stderr)
        continue
    soup = BeautifulSoup(resp.text, 'html.parser')
    _strip_ignored_markup(soup)
    _count_attribs(soup, attribs)

print(json.dumps(attribs, sort_keys=True, indent=4, ensure_ascii=False))
# for dt in dl.findAll("dt"):
# keys.append(dt.get_text("\n").strip())
# for dd in dl.findAll("dd"):
# values.append(dd.get_text("\n\n").strip())
# return dict(zip(keys, values))
# dl_dict = get_dl(soup)
# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(dl_dict)
# print(json.dumps(dl_dict, sort_keys=True, indent=4, ensure_ascii=False))
# print(dl_dict.keys())
#for status in status_gatunamn:
# resp=requests.get(carlotta_query_url, [('dataelement', status_gatunamn_dataelement), ('value_urlencoded',status), ('op', operation)])
# doc_root=ET.fromstring(resp.text)
#
# # Format is e.g "Ortnamn (6,515)"
# number_of_hits = doc_root.find(".//*[@class='register_filter']").text
# number_of_hits = int(''.join(re.findall(r'[^\D]',number_of_hits)))
#
#
# print(status)
# for n in range(number_of_hits):
#
# resp=requests.get(carlotta_object_browse_url, [('browseEnvironment', carlotta_element_search_path + '?' + '&'.join(['dataelement=' + status_gatunamn_dataelement, 'op=' + operation, 'value=' + status])), ('detailBrowseIndex', n)],allow_redirects=False)
#
# object_id = resp.headers['Location'].split(';')[0].split("/")[-1]
#
# object_ids.add(object_id)
# object_ids.append(object_id)
#
# print(object_id)
#
# print("antal: ",len(object_ids))