-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy patha_names_script_v2.py
161 lines (130 loc) · 5.79 KB
/
a_names_script_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# CURRENT
# Riva Tropp
# 7/27/2015
import pylev
import re
import sys
#Function to give numbers larger weight for distance
def samewords(name1, name2): #Checks if two phrases consist of the same words.
set1 = set()
set2 = set()
arr1 = name1.split()
arr2 = name2.split()
for v in arr1:
set1.add(v)
for x in arr2:
set2.add(x)
if set1 == set2:
return -2
else:
return 0
def distanceoffset(name1, name2): #Checks if the numbers in the strings are the same.
a = re.compile(r'[^0-9]*([0-9]+)[^0-9]*')
if (not a.match(name1) and a.match(name2)) or (not a.match(name2) and a.match(name1)):
#print >> sys.stderr, name1 + " has numbers ", name2 + " doesn't."
return 2
if not a.match(name1): #(and not a.match(name2)):
return 0 #Don't affect the distance at all.
match1 = re.findall(a, name1) #Find all the numbers in the string.
match2 = re.findall(a, name2) #Find all the numbers in the string.
set1 = set(match1)
set2 = set(match2)
if len(set1) != len(set2): #Different amounts of numbers in the string.
if match1[0] != match2[0]:
return len(max(name1, name2)) #Return the largest length as the distance so practically any match will be better.
else:
#print >> sys.stderr, "One of your strings has more numbers than the other, but the first pair matches."
return 2
else: #If there are the same amount of numbers in the string...
if set1 != set2:
#print >> sys.stderr, "Your numbers don't match."
return len(max(name1, name2))
else:
return -5 #Same # of numbers, same numbers.
#Returns a given phrase without any of the spaces
def wordnospaces(name1):
#Change to simple regex? Eh, don't mess with the merge.
arr = name1.split()
name1_ns = ""
for x in arr:
name1_ns = name1_ns + x
return name1_ns
#Checks if one string is inside the other.
def isinside(name1, name2):
if name1 in name2 or name2 in name1:
return -2
return 0
#Invoked when entering the files in, changes forms of 'avenue' into 'av'.
def one_ave(string1, pattern, string2):
arr1 = string1.replace('-', ' ').rsplit()
outbound = ""
for x in xrange(0, len(arr1)):
if pattern.match(arr1[x]):
arr1[x] = string2
outbound = outbound + " " + arr1[x]
return outbound
#Further penalties if distance == one of the strings, to prevent overmatching small strings.
def penalize(string1, string2):
if pylev.levenshtein(string1, string2) > min(len(string1), len(string2)):
return 3
return 0
################################################################################
#Actual code starts here:
################################################################################
#Files are already lowercase. (Command line).
f1 = open("ts2.txt")
f2 = open("stops2.txt")
turns = f1.readlines()
gtfs = f2.readlines()
#Making lists to read all the station names into.
turn_terms = [] #Turnstile
gtfs_terms = [] #gtfs
orig_turn = []
orig_gtfs = []
r_best = {}
pattern = re.compile(r'av[enu]+')
#Don't give me any PATH trains.
path = set(["NEWARK BM BW", "NEWARK C", "NEWARK HM HE", "NEWARK HW BMEBE", "HARRISON", "JOURNAL SQUARE", "GROVE STREET", "EXCHANGE PLACE", "PAVONIA NEWPORT", "CHRISTOPHER ST", "CITY BUS"]) #Hard Coded Path trains.
#Don't give me any Staten Island Railroad stations.
SIRS = set(["Nassau", "Annadale", "Tottenville", "Stapleton", "Clifton", "Grasmere", "Old Town", "Dongan Hills", "Jefferson Av", "Grant City", "New Dorp", "Oakwood Heights", "Bay Terrace", "Great Kills", "Eltingville", "Huguenot", "Prince's Bay", "Pleasant Plains", "Richmond Valley", "Nassau", "Atlantic"])
for t in turns: #Do some formatting, put in a list, put original in an identically indexed list.
a = t.strip('"').replace("/", ' ').replace("-", " ").strip()
if a not in path:
temp1 = one_ave(a.lower(), pattern, "av")
turn_terms.append(temp1)
orig_turn.append(t)
f1.close()
for g in gtfs: #Same thing for GTFS
a = g.replace('"', '').replace("/", ' ').replace("-", " ").strip()
if a not in SIRS:
temp1 = one_ave(a.lower(), pattern, "av")
gtfs_terms.append(temp1)
orig_gtfs.append(g)
f2.close()
bestmatches = {} #Where we'll store matches.
#Compare each station in the turnstile data to each station in the gtfs feed.
for t in xrange(0, len(turn_terms)):
for g in xrange(0, len(gtfs_terms)):
#Compute distance:
tinylist = [int(distanceoffset(turn_terms[t], gtfs_terms[g])) + int(pylev.levenshtein(gtfs_terms[g], turn_terms[t])) + int(isinside(turn_terms[t], gtfs_terms[g])) + int(samewords(turn_terms[t], gtfs_terms[g])) + int(penalize(gtfs_terms[g], turn_terms[t])), orig_gtfs[g], gtfs_terms[g], orig_turn[t]]
#Make the highest default so anything better will take its place.
bestmatches.setdefault(turn_terms[t], [len(turn_terms[t])])
r_best.setdefault(g, [len(gtfs_terms[g])])
#Check against previous, update if it's a better match for both words than the things they matched before.
if tinylist[0] < bestmatches[turn_terms[t]][0] and tinylist[0] < r_best[g]:
bestmatches[turn_terms[t]] = tinylist
r_best[g] = [tinylist[0], turn_terms[t]]
# print turn_terms[t], tinylist
# if "av n" in turn_terms[t]:
# print bestmatches[turn_terms[t]], tinylist
f3 = open('./matchtable.txt', 'w') #Now stick it all in a nice file.
for g in bestmatches:
f3.write(g + ",")
for x in xrange(0, len(bestmatches[g])):
if x == len(bestmatches[g])-1:
f3.write(str(bestmatches[g][x]).strip().strip('"'))
else:
f3.write(str(bestmatches[g][x]).strip().strip('"') + ",")
f3.write("\n")
f3.close()
#print bestmatches