From 84cae04d5ac4f747a73f951fc53cac2f001774ad Mon Sep 17 00:00:00 2001
From: Sindre Eiklid
Date: Sun, 15 Oct 2023 21:09:35 +0200
Subject: [PATCH] Add `subsample` method to `ModelCBOW`

---
 source/architechtures/cbow.py |  2 +-
 source/datahandler/loaders.py | 49 +++++++++++++++--------------------
 2 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/source/architechtures/cbow.py b/source/architechtures/cbow.py
index be54e1e..ee20b34 100644
--- a/source/architechtures/cbow.py
+++ b/source/architechtures/cbow.py
@@ -50,5 +50,5 @@ def run() -> None:
     utils.print_divider()
     validation_dataloader.plot_analogies_rank(k=20)
     validation_dataloader.plot_word_pair_similarity()
-    print(f"Analogy accuracy: {validation_dataloader.analogies_accuracy():.2f}%")
+    print(f"Analogy accuracy: {(validation_dataloader.analogies_accuracy() * 100):.2f}%")
     print(f"Spearman correlation coefficient: {validation_dataloader.word_pair_spearman_correlation():.5f}")
diff --git a/source/datahandler/loaders.py b/source/datahandler/loaders.py
index bfe4b6c..082f640 100644
--- a/source/datahandler/loaders.py
+++ b/source/datahandler/loaders.py
@@ -1,14 +1,7 @@
 from constants import (
     PROJECT_DIRECTORY_PATH
 )
-from utils import (
-    save_numpy,
-    load_numpy,
-    download_file,
-    normalize,
-    cosine_similarity,
-    save_plot
-)
+import utils
 
 import os
 import gensim.downloader
@@ -95,10 +88,9 @@ def get_token(self, index: int, default: str = None) -> str:
     def get_frequency(self, token: str, default=0) -> int:
         return self.token_freq.get(token, default)
 
-    def subsample_probability(self, token: str, threshold=1e-5):
-        """Compute the probability of keeping the given token."""
-        freq_ratio = self.get_frequency(token) / self.total_words
-        return 1 - np.sqrt(threshold / freq_ratio)
+    def subsample(self, token: str, threshold=1e-5) -> bool:
+        prob = (np.sqrt(self.get_frequency(token) / (threshold * self.total_words)) + 1) * (threshold * self.total_words) / self.get_frequency(token)
+        return (prob < np.random.rand())
 
     def __len__(self):
         return len(self.token_to_index)
@@ -124,12 +116,12 @@ def build(self, vocabulary: Vocabulary):
         filepath_cache = os.path.join(PROJECT_DIRECTORY_PATH, "data", self.data_directory, "validation_data", "analogy_test.npy")
         if os.path.exists(filepath_cache):
             # load cache
-            self.analogy_test = load_numpy(filepath_cache)
+            self.analogy_test = utils.load_numpy(filepath_cache)
         else:
             analogies = []
             # download raw data
             filepath = os.path.join(PROJECT_DIRECTORY_PATH, "data", "analogy_test.txt")
-            download_file("http://download.tensorflow.org/data/questions-words.txt", filepath)
+            utils.download_file("http://download.tensorflow.org/data/questions-words.txt", filepath)
             with open(filepath, "r") as file:
                 for line in file:
                     # skip headers
@@ -148,20 +140,20 @@ def build(self, vocabulary: Vocabulary):
                     analogies.append(test)
             # save to cache
             self.analogy_test = np.array(analogies)
-            save_numpy(filepath_cache, self.analogy_test)
+            utils.save_numpy(filepath_cache, self.analogy_test)
         progress_bar.update(1)
         # get wordsim353 test set
         filepath_cache = os.path.join(PROJECT_DIRECTORY_PATH, "data", self.data_directory, "validation_data", "wordsim353_test.npy")
         if os.path.exists(filepath_cache):
             # load cache
-            self.word_pair_similarity_test = load_numpy(filepath_cache)
+            self.word_pair_similarity_test = utils.load_numpy(filepath_cache)
         else:
             word_pairs = []
             # download raw data
             filepath = os.path.join(PROJECT_DIRECTORY_PATH, "data", "wordsim353_test", "combined.csv")
             if not os.path.exists(filepath):
                 filepath_zipped = os.path.join(PROJECT_DIRECTORY_PATH, "data", "wordsim353_test.zip")
-                download_file("https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip", filepath_zipped)
+                utils.download_file("https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip", filepath_zipped)
                 with zipfile.ZipFile(filepath_zipped, "r") as file:
                     file.extractall(os.path.dirname(filepath))
             # parse raw data
@@ -181,7 +173,7 @@ def build(self, vocabulary: Vocabulary):
                 word_pairs.append([float(vocabulary.get_index(word1)), float(vocabulary.get_index(word2)), float(sim_score)])
             # save to cache
             self.word_pair_similarity_test = np.array(word_pairs)
-            save_numpy(filepath_cache, self.word_pair_similarity_test)
+            utils.save_numpy(filepath_cache, self.word_pair_similarity_test)
         progress_bar.update(1)
 
     def evaluate_analogies(self, embeddings: np.ndarray, quiet=False):
@@ -193,9 +185,9 @@ def evaluate_analogies(self, embeddings: np.ndarray, quiet=False):
             word_vector_3 = embeddings[word3_idx]
             # compute the analogy vector
             analogy_vector = word_vector_1 - word_vector_2
-            predicted_vector = normalize(word_vector_3 + analogy_vector)
+            predicted_vector = utils.normalize(word_vector_3 + analogy_vector)
             # get cosine similarity scores
-            cosine_similarities: np.ndarray = cosine_similarity(embeddings, predicted_vector)
+            cosine_similarities: np.ndarray = utils.cosine_similarity(embeddings, predicted_vector)
             # exclude input words from similarity scores
             cosine_similarities[word1_idx] = -float("inf")
             cosine_similarities[word2_idx] = -float("inf")
@@ -234,7 +226,7 @@ def plot_analogies_rank(self, k=5):
         plt.xticks(range(1, k + 1))
         plt.grid(axis='y')
 
-        save_plot(filepath=os.path.join(PROJECT_DIRECTORY_PATH, "data", self.data_directory, "plots", title + ".png"))
+        utils.save_plot(filepath=os.path.join(PROJECT_DIRECTORY_PATH, "data", self.data_directory, "plots", title + ".png"))
         plt.close()
 
     def evaluate_word_pair_similarity(self, embeddings: np.ndarray, quiet=False):
@@ -246,7 +238,7 @@ def evaluate_word_pair_similarity(self, embeddings: np.ndarray, quiet=False):
             word_vector_1 = embeddings[int(word1_idx)]
             word_vector_2 = embeddings[int(word2_idx)]
             # get cosine similarity
-            model_score = cosine_similarity(word_vector_1, word_vector_2)
+            model_score = utils.cosine_similarity(word_vector_1, word_vector_2)
             model_scores.append(model_score)
             human_scores.append(human_score)
         self.word_pair_similarity_model_scores = np.array(model_scores)
@@ -266,7 +258,7 @@ def plot_word_pair_similarity(self):
         m, b = np.polyfit(self.word_pair_similarity_human_scores, self.word_pair_similarity_model_scores, 1)
         plt.plot(self.word_pair_similarity_human_scores, m * self.word_pair_similarity_human_scores + b, color="red")
 
-        save_plot(filepath=os.path.join(PROJECT_DIRECTORY_PATH, "data", self.data_directory, "plots", title + ".png"))
+        utils.save_plot(filepath=os.path.join(PROJECT_DIRECTORY_PATH, "data", self.data_directory, "plots", title + ".png"))
         plt.close()
 
 
@@ -284,8 +276,8 @@ def build(self, sentences: list[list[str]], vocabulary: Vocabulary, window_size:
 
         if os.path.exists(context_words_filepath) and os.path.exists(target_words_filepath):
             progress_bar = tqdm.tqdm(desc="Building training data", total=1)
-            self.context_words = torch.tensor(load_numpy(context_words_filepath), dtype=torch.long, device=device)
-            self.target_words = torch.tensor(load_numpy(target_words_filepath), dtype=torch.long, device=device)
+            self.context_words = torch.tensor(utils.load_numpy(context_words_filepath), dtype=torch.long, device=device)
+            self.target_words = torch.tensor(utils.load_numpy(target_words_filepath), dtype=torch.long, device=device)
             self._num_samples = len(self.target_words)
             progress_bar.update(1)
             return
@@ -294,7 +286,7 @@ def build(self, sentences: list[list[str]], vocabulary: Vocabulary, window_size:
         target_words = []
         for sentence in tqdm.tqdm(sentences, desc="Building training data"):
             for center_position, center_word in enumerate(sentence):
-                if center_word not in vocabulary:
+                if center_word not in vocabulary or vocabulary.subsample(center_word, threshold=1e-5):
                     continue
                 # define the boundaries of the window
                 start_position = max(0, center_position - window_size)
@@ -314,12 +306,13 @@ def build(self, sentences: list[list[str]], vocabulary: Vocabulary, window_size:
 
         context_words = np.array(context_words)
         target_words = np.array(target_words)
-        save_numpy(context_words_filepath, context_words)
-        save_numpy(target_words_filepath, target_words)
+        utils.save_numpy(context_words_filepath, context_words)
+        utils.save_numpy(target_words_filepath, target_words)
 
         self.context_words = torch.tensor(context_words, dtype=torch.long, device=device)
         self.target_words = torch.tensor(target_words, dtype=torch.long, device=device)
         self._num_samples = len(self.target_words)
+        utils.plot_target_words_occurances(target_words, data_directory="cbow")
 
     def __iter__(self):
         for start in range(0, self._num_samples, self._batch_size):
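
A note on the new `subsample` method (defined on the `Vocabulary` class in loaders.py and called from the CBOW training-data builder): it computes the word2vec-style keep probability p_keep = (sqrt(f / (t * N)) + 1) * (t * N) / f, where f is the token's raw count, N is the total number of tokens in the corpus and t is the threshold (1e-5 here), and it returns True when the occurrence should be discarded, which is why the call site reads `if center_word not in vocabulary or vocabulary.subsample(...): continue`. The sketch below restates that computation as standalone helpers so the probabilities can be inspected in isolation; `keep_probability`, `should_discard`, and the example counts are hypothetical and not part of the patch.

import numpy as np

def keep_probability(count: int, total_words: int, threshold: float = 1e-5) -> float:
    # word2vec-style keep probability for a token that occurs `count` times
    # out of `total_words` tokens in the corpus (same formula as Vocabulary.subsample)
    scaled = threshold * total_words
    return (np.sqrt(count / scaled) + 1) * scaled / count

def should_discard(count: int, total_words: int, threshold: float = 1e-5) -> bool:
    # mirrors the return value of subsample: True means "drop this occurrence",
    # drawn independently for every occurrence of the token
    return keep_probability(count, total_words, threshold) < np.random.rand()

# illustrative counts for a hypothetical corpus of 10M tokens
total = 10_000_000
print(keep_probability(1_000_000, total))  # very frequent token -> ~0.01, mostly dropped
print(keep_probability(50, total))         # rare token -> above 1.0, never dropped

Because the check is applied only to the center (target) word in `build`, frequent words are thinned out as prediction targets while they can still appear as context words around the surviving centers.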