word2vec_fun.py
#!/usr/bin/env python3
import tensorflow as tf
import tqdm


def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training examples
        # with a positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")

            # Build context and label vectors (for one target word).
            negative_sampling_candidates = tf.expand_dims(
                negative_sampling_candidates, 1)
            context = tf.concat([context_class, negative_sampling_candidates], 0)
            label = tf.constant([1] + [0]*num_ns, dtype="int64")

            # Append each element from the training example to the global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels
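

# A minimal usage sketch, not part of the original file: the toy token ids,
# vocab_size, window_size, num_ns, and seed values below are assumptions for
# illustration only. In practice, `sequences` would be integer-encoded
# sentences, e.g. produced by a tf.keras.layers.TextVectorization layer
# applied to a text corpus.
if __name__ == "__main__":
    # Token ids are kept away from the lowest ranks so the Zipfian sampling
    # table does not discard most pairs in this tiny example.
    example_sequences = [
        [120, 342, 87, 953, 411],
        [342, 87, 2048, 120, 671],
    ]
    targets, contexts, labels = generate_training_data(
        sequences=example_sequences,
        window_size=2,
        num_ns=4,
        vocab_size=4096,
        seed=42)
    print(f"generated {len(targets)} training examples")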