forked from bfelbo/DeepMoji
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vocab_extension.py
30 lines (24 loc) · 938 Bytes
/
vocab_extension.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""
Extend the given vocabulary using dataset-specific words.
1. First create a vocabulary for the specific dataset.
2. Find all words not in our vocabulary, but in the dataset vocabulary.
3. Take top X (default=1000) of these words and add them to the vocabulary.
4. Save this combined vocabulary and embedding matrix, which can now be used.
"""
from __future__ import print_function
import example_helper
import json
from deepmoji.create_vocab import extend_vocab, VocabBuilder
from deepmoji.word_generator import WordGenerator
# Toy "dataset": 'newword' occurs twice and '#zzzzaaazzz' only once, so
# 'newword' is the most frequent candidate word.
new_words = [u'#zzzzaaazzz', u'newword', u'newword']

# Build word counts for the toy dataset.
word_gen = WordGenerator(new_words)
vb = VocabBuilder(word_gen)
vb.count_all_words()

# Load the existing vocabulary. The path is relative to the current working
# directory, so the script is expected to be run from the directory it
# lives in.
with open('../model/vocabulary.json') as f:
    vocab = json.load(f)

# Show the vocabulary size before extension and the dataset word counts.
print(len(vocab))
print(vb.word_counts)

# Add at most one dataset-specific token. The return value is not used: the
# lookups below rely on `vocab` itself having been updated by this call.
extend_vocab(vocab, vb, max_tokens=1)

# 'newword' should be the token that was added, because it is more frequent
# in `new_words` (2 occurrences) than '#zzzzaaazzz' (1 occurrence).
# A KeyError here means it was not added.
print(vocab[u'newword'])

# Expected to be one larger than the size printed above.
print(len(vocab))