-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathexample.py
54 lines (38 loc) · 1.39 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import csv
import random
from itertools import combinations
import highered
# Total number of matching pairs sampled further down the script
# (half drawn from names, half from addresses).
POSITIVE_SAMPLE = 200

# unique_id -> every name / address recorded under that id.
names = {}
addresses = {}
# Every distinct name / address value seen in the file.
all_names = set()
all_addresses = set()

with open('restaurant-nophone-training.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        unique_id = row['unique_id']
        name = row['name']
        # Strip once and use the cleaned value for BOTH collections.
        # Previously only the per-id list was stripped, so the pool used
        # for non-matching pairs kept stray whitespace that the matching
        # pairs never had.
        address = row['address'].strip()

        names.setdefault(unique_id, []).append(name)
        all_names.add(name)
        addresses.setdefault(unique_id, []).append(address)
        all_addresses.add(address)

# Only ids with at least two records can supply a matching pair.
names = {k: v for k, v in names.items() if len(v) > 1}
addresses = {k: v for k, v in addresses.items() if len(v) > 1}
# Turn the value sets into lists and keep a shuffled copy of each; zipping
# the shuffled copy against the original yields random pairings.
all_names = list(all_names)
names_1 = all_names[:]
random.shuffle(names_1)

all_addresses = list(all_addresses)
addresses_1 = all_addresses[:]
random.shuffle(addresses_1)

# Matching pairs: the first two values recorded under a randomly chosen id.
# `//` keeps the sample size an int, and `list(...)` hands random.sample a
# real sequence (dict key views are rejected on current Python 3).
positive_examples = []
for entity_id in random.sample(list(names), POSITIVE_SAMPLE // 2):
    positive_examples.append(names[entity_id][:2])
for entity_id in random.sample(list(addresses), POSITIVE_SAMPLE // 2):
    positive_examples.append(addresses[entity_id][:2])

# Non-matching pairs: random pairings, up to POSITIVE_SAMPLE per field.
# A shuffle can leave an item in its original slot, which would pair a
# string with itself; those are dropped so they are not used as
# non-matches. Built as lists because zip() is not sliceable on Python 3.
# NOTE(review): two different strings of the same unique_id can still be
# paired here; confirm whether that matters for this example.
negative_examples = [
    (left, right)
    for left, right in zip(names_1, all_names)
    if left != right
][:POSITIVE_SAMPLE]
negative_examples += [
    (left, right)
    for left, right in zip(addresses_1, all_addresses)
    if left != right
][:POSITIVE_SAMPLE]
# Train highered's CRFEditDistance on the labelled pairs, then print the
# learned parameters and the result of comparing two sample strings.
ed = highered.CRFEditDistance()

X = positive_examples + negative_examples
# Size the labels from the example lists themselves so X and Y always line
# up, even if fewer pairs than requested were available.
Y = ['match'] * len(positive_examples) + ["non-match"] * len(negative_examples)

# Single-argument form: valid on both Python 2 and 3, and prints the same
# "X Y" text as the old `print X, Y` statement.
print('%s %s' % (X, Y))

ed.train(X, Y)

print(ed.model.parameters)
print(ed('foo', 'bar'))