generate_data.py
import csv
import gzip
import json

import numpy as np
import pandas as pd

from address.proc_gen import TokenCategory as TC
import address.proc_gen as pg
# constants and functions
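# SEED fixes the RNG so the 192k-point training set and 32k-point test set are reproducible across runs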
SEED = 128
NUM_TRAIN_PTS = 192000
NUM_TEST_PTS = 32000
def create_input_output_pair(genmap, tmpls, rng):
    """Sample a random template and return one (address string, label sequence) pair."""
    idx = rng.choice(len(tmpls))
    tokens = pg.sample_address_tokens(genmap, tmpls[idx])
    address, clf = pg.join_address_tokens(tokens, tmpls[idx])
    clf_chars = [str(e.value) for e in clf]
    input_seq = ''.join(address)
    output_seq = ' '.join(clf_chars)
    return input_seq, output_seq
# define random number generator
rng = np.random.default_rng(SEED)
#
# define generators
#
# unit numbers
df_unit = pd.read_csv(
    'datasets/unit_designation.txt', names=['values'], dtype='string', na_filter=False)
g_unit = pg.AlphanumGenerator(
    nmin=1, nmax=10000,
    name='unit',
    rng=rng,
    desig=df_unit['values'],
)
# house numbers
g_house_num = pg.AlphanumGenerator(
    nmin=1, nmax=50000,
    name='house_num',
    rng=rng,
)
# street names
df_st_name = pd.read_csv(
    'datasets/street_names.txt', names=['values'], dtype='string')
g_st_name = pg.UniformListSampler(
    name='st_name',
    rng=rng,
    values=df_st_name['values'],
)
# street type
df_st_type = pd.read_csv(
    'datasets/street_types.txt', names=['values'], dtype='string')
g_st_type = pg.UniformListSampler(
    name='st_type',
    rng=rng,
    values=df_st_type['values'],
)
# direction
df_dir = pd.read_csv(
    'datasets/directions.txt', names=['values'], dtype='string')
g_dir = pg.UniformListSampler(
    name='dir',
    rng=rng,
    values=df_dir['values'],
)
# city
df_city = pd.read_csv(
    'datasets/cities.txt', names=['values'], dtype='string')
g_city = pg.UniformListSampler(
    name='city',
    rng=rng,
    values=df_city['values'],
)
# province or territory
df_prov = pd.read_csv(
    'datasets/provinces.txt', names=['values'], dtype='string')
g_prov = pg.UniformListSampler(
    name='prov',
    rng=rng,
    values=df_prov['values'],
)
# postal code
g_postcode = pg.PostalCodeGenerator(
    name='postcode',
    sep_prob=.5,
    rng=rng,
)
#
# prepare procedural generation
#
# load address templates
with open('datasets/templates.json', 'r') as jsonfile:
    templates = json.load(jsonfile)
# define template-string-to-generator mapping
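# keys are the template placeholder strings; note that 'dp' and 'ds'
# (presumably pre- and post-directionals) both reuse the same direction generator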
generator_map = {
    'un': g_unit,
    'hn': g_house_num,
    'sn': g_st_name,
    'st': g_st_type,
    'dp': g_dir,
    'ds': g_dir,
    'ci': g_city,
    'pr': g_prov,
    'po': g_postcode,
}
#
# generate and save data
#
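# each row is one generated pair, written as input_seq|output_seq: the joined
# address string and its space-separated category labels from create_input_output_pair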
train_data_path = 'datasets/train_sequences_pg.csv.gz'
test_data_path = 'datasets/test_sequences_pg.csv.gz'
# training data
with gzip.open(train_data_path, 'wt', newline='') as trainfile:
    writer = csv.writer(trainfile, delimiter='|')
    for _ in range(NUM_TRAIN_PTS):
        pair = create_input_output_pair(generator_map, templates, rng)
        writer.writerow(pair)
print(f"Saved {NUM_TRAIN_PTS} training points to '{train_data_path}'.")
# synthetic test data
with gzip.open(test_data_path, 'wt', newline='') as testfile:
    writer = csv.writer(testfile, delimiter='|')
    for _ in range(NUM_TEST_PTS):
        pair = create_input_output_pair(generator_map, templates, rng)
        writer.writerow(pair)
print(f"Saved {NUM_TEST_PTS} test points to '{test_data_path}'.")