-
Notifications
You must be signed in to change notification settings - Fork 4
/
word_pair_similarity_matrix.py
211 lines (158 loc) · 10.1 KB
/
word_pair_similarity_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# A series of functions designed to quickly and painlessly create a word-word similarity matrix in a hyperdimensional LSA/LSI space.
# Includes three separate functions that can be used in order:
# create_lsi_lookup_table: Create an LSI lookup table for later use in calculating word-word similarity scores
# create_similarity_matrix: Create a matrix of word-word pairs, complete with word, location in LSI dictionary, and hyperdimensional vector
# calculate_similarity_matrix: Calculate pairwise distance scores; implement with np.apply_along_axis (axis=1)
# Includes an additional function that combines all three of the earlier functions without intervening steps:
# all_in_one_similiarity_matrix
# Written by: A. Paxton
# Date last modified: 24 November 2015
import pandas as pd
import scipy.spatial.distance as dist
import gensim, itertools
import numpy as np
from ast import literal_eval
####
# create an LSI lookup table for later use in calculating word-word similarity scores
def create_lsi_lookup_table(unique_word_vector, target_dictionary, target_lsi_model, nd):
    """Build a lookup table of dictionary locations and LSI vectors for words.

    Parameters:
        unique_word_vector: 1D iterable of strings to include in the table.
        target_dictionary: gensim-created dictionary (any object exposing
            ``doc2bow``) that should contain the words in unique_word_vector.
        target_lsi_model: gensim-created LSI model generated with
            target_dictionary; it is indexed with a one-entry bag-of-words
            list and must yield ``(dimension, value)`` pairs.
        nd: number of dimensions used in LSI creation.

    Returns:
        pandas DataFrame with a fresh 0..n-1 index and the columns
        'word', 'loc' (the ``(id, count)`` tuple reported by ``doc2bow``)
        and 'hdv' (object-dtype numpy array holding the nd vector values).
        Words for which ``doc2bow`` returns nothing, or whose vector does not
        have exactly nd values, are left out of the table.
    """

    def as_object_column(items):
        # Fill cell by cell so that tuples and arrays are stored whole in a
        # 1D object array instead of being unpacked into extra dimensions.
        column = np.empty(len(items), dtype=object)
        for position, item in enumerate(items):
            column[position] = item
        return column

    words, locations, vectors = [], [], []
    for word in unique_word_vector:
        # Use the doc2bow result directly; round-tripping it through str()
        # and literal_eval truncated long ids and crashed on unknown words.
        bag = target_dictionary.doc2bow([word])
        if not bag:
            continue
        location = tuple(bag[0])
        vector = np.array(
            [value for (_dimension, value) in target_lsi_model[[location]]],
            dtype=object,
        )
        # Keep only words that received the full nd dimensions.
        if len(vector) != nd:
            continue
        words.append(word)
        locations.append(location)
        vectors.append(vector)

    return pd.DataFrame(
        {
            'word': as_object_column(words),
            'loc': as_object_column(locations),
            'hdv': as_object_column(vectors),
        },
        columns=['word', 'loc', 'hdv'],
    )
####
# create a matrix of word-word pairs, complete with word, location in LSI dictionary, and hyperdimensional vector
def create_similarity_matrix(unique_word_vector, lookup_table, word_varname, loc_varname, hdv_varname, winnow_variable, ascending):
    """Create a table of word-word pairs with locations and vectors attached.

    Parameters:
        unique_word_vector: 1D string vector over which to build all
            non-repeating pairs.
        lookup_table: pandas DataFrame with (minimally) the word, dictionary
            location and hyperdimensional vector columns named below.
        word_varname: name of the lookup_table column that holds the words.
        loc_varname: name of the lookup_table column that holds the
            dictionary location.
        hdv_varname: name of the lookup_table column that holds the
            hyperdimensional vectors.
        winnow_variable: 1D vector of strings, at least one of which must be
            present in a pair for it to be kept; pass [] to keep every pair.
        ascending: if truthy, sort by 'word1' and then 'word2', ascending.

    Returns:
        pandas DataFrame with a fresh 0..n-1 index and the columns
        'word1', 'word2', 'loc1', 'hdv1', 'loc2', 'hdv2'. Pairs with a word
        that is absent from lookup_table are dropped by the inner merges.
        Each pair appears once, even when both words are in winnow_variable.
    """
    # make non-repeating word pairs
    word_pairs = pd.DataFrame(
        list(itertools.combinations(unique_word_vector, 2)),
        columns=['word1', 'word2'],
    )

    # Restrict the lookup table to the three needed columns and join on the
    # word column explicitly; otherwise any extra lookup column would be
    # shared by both sides of the second merge and silently become a join key.
    lookup_columns = lookup_table[[word_varname, loc_varname, hdv_varname]]
    for suffix in ('1', '2'):
        side = lookup_columns.rename(columns={
            word_varname: 'word' + suffix,
            loc_varname: 'loc' + suffix,
            hdv_varname: 'hdv' + suffix,
        })
        word_pairs = pd.merge(word_pairs, side, how='inner', on='word' + suffix)

    # reduce word_pairs so that one member of pair must be a member of the
    # target word group; a single OR-mask keeps each qualifying pair once
    if len(winnow_variable) > 0:
        in_first = word_pairs['word1'].isin(winnow_variable)
        in_second = word_pairs['word2'].isin(winnow_variable)
        word_pairs = word_pairs.loc[in_first | in_second]

    # if desired, sort ascending
    if ascending:
        word_pairs = word_pairs.sort_values(by=['word1', 'word2'], ascending=True)

    # reset the index, discarding the old one, and hand back the dataframe
    return word_pairs.reset_index(drop=True)
####
# calculate pairwise distance scores; implement with np.apply_along_axis (axis=1)
def calculate_similarity_matrix(similarity_matrix, first_hdv_column_loc, second_hdv_column_loc):
    """Return the cosine similarity of the two vectors stored in one row.

    Written to be applied row by row, e.g. with np.apply_along_axis (axis=1).

    Parameters:
        similarity_matrix: one row of the pair table; an indexable holding
            (minimally) two hyperdimensional vectors.
        first_hdv_column_loc: position in the row of the first vector.
        second_hdv_column_loc: position in the row of the second vector.

    Returns:
        One minus the scipy cosine distance between the two vectors.
    """
    # pull the two vectors out of the row
    first_vector, second_vector = (
        similarity_matrix[position]
        for position in (first_hdv_column_loc, second_hdv_column_loc)
    )
    # turn cosine distance into cosine similarity
    # (thanks to Rick Dale for this snippet)
    cosine_distance = dist.cosine(first_vector, second_vector)
    return 1 - cosine_distance
####
# implement all three of the earlier functions in a single function that will only spit out final similarity matrix
def all_in_one_similiarity_matrix(unique_word_vector, target_dictionary, target_lsi_model, nd, winnow_variable, ascending):
    """Build the finished word-word cosine similarity table in a single call.

    Runs the three standalone steps (lookup table, pair table, similarity
    scores) back to back and returns only the final table.

    Parameters:
        unique_word_vector: 1D string vector to include in the lookup table
            and to pair up.
        target_dictionary: gensim-created dictionary (any object exposing
            ``doc2bow``) that should contain the words in unique_word_vector.
        target_lsi_model: gensim-created LSI model generated with
            target_dictionary; it is indexed with a one-entry bag-of-words
            list and must yield ``(dimension, value)`` pairs.
        nd: number of dimensions used in LSI creation.
        winnow_variable: 1D vector of strings, at least one of which must be
            present in a pair for it to be kept; pass [] to keep every pair.
        ascending: if truthy, sort by 'word1' and then 'word2', ascending.

    Returns:
        pandas DataFrame with a fresh 0..n-1 index and the columns 'word1',
        'word2', 'loc1', 'hdv1', 'loc2', 'hdv2' and 'cosine' (one minus the
        scipy cosine distance of 'hdv1' and 'hdv2'). Words for which
        ``doc2bow`` returns nothing, or whose vector does not have exactly nd
        values, take part in no pair.
    """

    def as_object_column(items):
        # Fill cell by cell so that tuples and arrays are stored whole in a
        # 1D object array instead of being unpacked into extra dimensions.
        column = np.empty(len(items), dtype=object)
        for position, item in enumerate(items):
            column[position] = item
        return column

    # The words are walked twice (lookup, then pairing), so pin them down.
    unique_word_vector = list(unique_word_vector)

    ###
    # STEP ONE: create LSI lookup table (standalone: create_lsi_lookup_table)
    words, locations, vectors = [], [], []
    for word in unique_word_vector:
        # Use the doc2bow result directly; round-tripping it through str()
        # and literal_eval truncated long ids and crashed on unknown words.
        bag = target_dictionary.doc2bow([word])
        if not bag:
            continue
        location = tuple(bag[0])
        vector = np.array(
            [value for (_dimension, value) in target_lsi_model[[location]]],
            dtype=object,
        )
        # Keep only words that received the full nd dimensions.
        if len(vector) != nd:
            continue
        words.append(word)
        locations.append(location)
        vectors.append(vector)
    lookup_table = pd.DataFrame(
        {
            'word': as_object_column(words),
            'loc': as_object_column(locations),
            'hdv': as_object_column(vectors),
        },
        columns=['word', 'loc', 'hdv'],
    )

    ###
    # STEP TWO: assemble first part of similarity matrix
    # (standalone: create_similarity_matrix)
    # make non-repeating word pairs
    word_pairs = pd.DataFrame(
        list(itertools.combinations(unique_word_vector, 2)),
        columns=['word1', 'word2'],
    )
    # attach location and vector for each side, joining on the word itself
    for suffix in ('1', '2'):
        side = lookup_table.rename(columns={
            'word': 'word' + suffix,
            'loc': 'loc' + suffix,
            'hdv': 'hdv' + suffix,
        })
        word_pairs = pd.merge(word_pairs, side, how='inner', on='word' + suffix)
    # reduce word_pairs so that one member of pair must be a member of the
    # target word group; a single OR-mask keeps each qualifying pair once
    if len(winnow_variable) > 0:
        in_first = word_pairs['word1'].isin(winnow_variable)
        in_second = word_pairs['word2'].isin(winnow_variable)
        word_pairs = word_pairs.loc[in_first | in_second]
    # if desired, sort ascending
    if ascending:
        word_pairs = word_pairs.sort_values(by=['word1', 'word2'], ascending=True)
    # reset the index, discarding the old one
    word_pairs = word_pairs.reset_index(drop=True)

    ###
    # STEP THREE: calculate pairwise similarity scores
    # (standalone: calculate_similarity_matrix)
    # Computed pair by pair rather than with np.apply_along_axis, which
    # refuses to run over an empty table. The vectors are stored with object
    # dtype, so they are cast to float before being handed to scipy.
    word_pairs['cosine'] = np.array(
        [
            1 - dist.cosine(np.asarray(first, dtype=float), np.asarray(second, dtype=float))
            for first, second in zip(word_pairs['hdv1'], word_pairs['hdv2'])
        ],
        dtype=float,
    )  # thanks to Rick Dale for the similarity snippet

    ###
    # when we're done, spit out the completed dataframe
    return word_pairs