Adding README.md and a simple python script to check the nearest words

hardikp · Apr 27, 2017 · 6cc1b47 · 6cc1b47
1 parent d83de07
commit 6cc1b47
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,32 @@
+# fnlp
+
+This repo contains scripts to gather finance data and train NLP models using the text data.
+
+## Word Vectors
+
+Trained word vectors are available on the [releases](https://github.com/hardikp/fnlp/releases) page.
+
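+Let's check whether the closest words make sense. By default, the script prints the five words whose vectors have the highest cosine similarity to the input word.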
+
+```bash
+$ python3 test_word_vectors.py --word IRA
+Roth
+SEP
+IRAs
+401
+retirement
+
+$ python3 test_word_vectors.py --word option
+call
+put
+options
+exercise
+underlying
+
+$ python3 test_word_vectors.py --word stock
+shares
+market
+stocks
+share
+price
+```
diff --git a/test_word_vectors.py b/test_word_vectors.py
@@ -0,0 +1,47 @@
+from __future__ import absolute_import, division, print_function
+
+from argparse import ArgumentParser
+from collections import Counter
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+
+def print_nearest_words(args):
+    # Load the word vectors
+    embeddings_index = {}
+    f = open(args.vectors)
+    for line in f:
+        values = line.split(' ')
+        w = values[0]
+        coefs = np.asarray(values[1:], dtype='float32')
+        embeddings_index[w] = coefs
+    f.close()
+
+    # Get the similarity scores
+    score_dict = {}
+    for w in embeddings_index.keys():
+        if args.word == w:
+            continue
+
+        score = cosine_similarity(embeddings_index[args.word].reshape(1, -1), embeddings_index[w].reshape(1, -1))[0][0]
+        score_dict[w] = score
+
+    closest = Counter(score_dict).most_common(args.num_words)
+
+    for word, score in closest:
+        if args.verbose:
+            print(score, word)
+        else:
+            print(word)
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--vectors', default='vectors.txt', help='Word vector file')
+    parser.add_argument('--vocab', default='vocab.txt', help='Vocab file')
+    parser.add_argument('--word', default='dollar', help='Input word')
+    parser.add_argument('--verbose', type=bool, default=False, help='Print score')
+    parser.add_argument('--num_words', type=int, default=5, help='Number of closest words to print')
+    args = parser.parse_args()
+
+    print_nearest_words(args)