-
Notifications
You must be signed in to change notification settings - Fork 0
/
smart_merge.py
121 lines (90 loc) · 3.55 KB
/
smart_merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from utils import read_json, write_json, smart_ratio, get_names, get_all_names
from manual_cleaning import clean
# Source id -> human-readable name used in progress messages.
# Each id is also the stem of that source's input file, data/<id>.json.
# Key order matters: `data` is filled in this order and merge() treats the
# first key of `data` as the base source that the others are matched against.
SOURCE_NAMES = {
    "esv": "Ekonomistyrningsverket",
    "stkt": "Statskontoret",
    "scb": "SCB",
    "sfs": "Statens författningssamling",
    "wd": "Wikidata",
    "agv": "Arbetsgivarverket",
    "handlingar": "Handlingar.se",
}

# Load every source up front, keyed by source id.
data = {}
for source, display_name in SOURCE_NAMES.items():
    print(f"Loading data from {display_name}...")
    data[source] = read_json(f"data/{source}.json")
def has_org_nr(source):
    """Return whether any agency record in *source* has an "org_nr" key.

    Args:
        source: Mapping of agency name to that agency's record.

    Returns:
        bool: True if at least one record contains "org_nr", otherwise
        False (including for an empty mapping).
    """
    # any() always yields a real bool, so callers never see an implicit None.
    return any("org_nr" in record for record in source.values())
def get(merged_data, key):
    """Collect the value stored under *key* from every source that has it.

    Args:
        merged_data: Mapping of source id to that source's record.
        key: Field name to look up in each record.

    Returns:
        dict: Source id -> record[key], limited to the sources whose record
        contains *key*.
    """
    return {
        source: record[key]
        for source, record in merged_data.items()
        if key in record
    }
def find_match(merged_data, key, value):
    """Return the first merged agency holding a record with record[key] == value.

    Args:
        merged_data: Mapping of agency name to a mapping of source id to
            either a single record (dict) or a list of records.
        key: Field name to compare, e.g. "org_nr".
        value: Value the field must equal.

    Returns:
        The matching agency name (first one in iteration order), or None
        when no record matches.
    """
    for agency, sources in merged_data.items():
        for entry in sources.values():
            # merge() promotes a source slot to a list when several records
            # from the same source match one agency; search inside it too,
            # otherwise those records can never be found by this lookup.
            records = entry if isinstance(entry, list) else [entry]
            if any(key in record and record[key] == value for record in records):
                return agency
    return None
def is_court(org_nr, agency):
    """Tell whether a record uses org number 202100-2742 without being Domstolsverket.

    merge() skips org-number matching for records where this returns True.
    NOTE(review): presumably individual courts are registered under
    Domstolsverket's organisation number - confirm against the source data.

    Args:
        org_nr: Organisation number of the record.
        agency: Agency name of the record; compared case-insensitively.

    Returns:
        bool: True if *org_nr* is "202100-2742" and *agency* is not
        "domstolsverket" (ignoring case), otherwise False.
    """
    if org_nr != "202100-2742":
        return False
    return agency.lower() != "domstolsverket"
def _attach_record(entry, source, record):
    """Store *record* under *source* in *entry* without dropping earlier ones."""
    if source not in entry:
        entry[source] = record
    elif isinstance(entry[source], list):
        entry[source].append(record)
    else:
        # Second record from the same source: promote the slot to a list.
        entry[source] = [entry[source], record]


def merge(data):
    """Merge per-source agency records into one mapping keyed by agency name.

    The first source in *data* seeds the result. Every record of each later
    source is then matched against the merged agencies, first by "org_nr"
    (skipped for records flagged by is_court) and otherwise by name via
    smart_ratio. Matched records are attached to the existing agency;
    unmatched ones become new agencies.

    Args:
        data: Mapping of source id to a mapping of agency name to record
            (dict). Iteration order decides which source is the base.
            Records may be mutated: a record matched under a different
            agency name gets its own name added to "other_names".

    Returns:
        dict: Agency name -> {source id -> record, or list of records when
        several records from one source matched the same agency}. Empty
        when *data* is empty.
    """
    if not data:
        # Nothing to merge; avoids indexing into an empty source list.
        return {}

    first_source, *other_sources = data
    print(f"First source: {SOURCE_NAMES[first_source]}.")
    merged_data = {
        agency: {first_source: record}
        for agency, record in data[first_source].items()
    }

    for other_source in other_sources:
        print(f"Adding new source: {SOURCE_NAMES[other_source]}")
        new_agencies = {}
        # Names are collected once per source, and new agencies are added
        # only after the loop, so records from the same source are never
        # matched against each other's newly created entries.
        all_names = get_all_names(merged_data)

        for agency, agency_data in data[other_source].items():
            match = None
            org_nr = agency_data.get("org_nr")
            if org_nr and not is_court(org_nr, agency):
                match = find_match(merged_data, "org_nr", org_nr)

            if match is None:
                # Fall back to name matching on every known name of the record.
                # NOTE(review): smart_ratio is expected to return
                # (match, similarity) with match None when nothing is close
                # enough - confirm in utils.
                agency_names = get_names(agency_data)
                agency_names.append(agency)
                match, _similarity = smart_ratio(agency_names, all_names)

            if match is None:
                new_agencies[agency] = {other_source: agency_data}
                continue

            if match != agency:
                # Keep this source's own name for the agency as an alias.
                other_names = agency_data.setdefault("other_names", [])
                if agency not in other_names:
                    other_names.append(agency)

            _attach_record(merged_data[match], other_source, agency_data)

        merged_data.update(new_agencies)

    return merged_data
# Script entry: merge all loaded sources, persist the result, then clean it.
print("Merging...")
result = merge(data)
print("Merge finished.")
print("Writing to file...")
# The output path is given without an extension; presumably write_json
# appends ".json" itself - confirm in utils.
write_json(result, "data/merged")
print("Cleaning...")
# clean() takes no arguments; presumably it post-processes the file written
# above - confirm in manual_cleaning.
clean()