
Commit

Fix syntax errors
sindre0830 committed Oct 16, 2023
1 parent 8ded051 commit 4af0059
Showing 3 changed files with 65 additions and 67 deletions.
10 changes: 4 additions & 6 deletions source/datahandler/loaders.py
@@ -217,7 +217,7 @@ def plot_analogies_rank(self, k=5):

rank_counts = [np.sum(self.analogy_similarity_rank == i) for i in range(k)]

-        title = f"Rank Distribution of Correct Analogy"
+        title = "Rank Distribution of Correct Analogy"

_, ax = plt.subplots()
ax.bar(range(1, k + 1), rank_counts)
@@ -235,7 +235,6 @@ def plot_analogies_rank(self, k=5):
def evaluate_word_pair_similarity(self, embeddings: np.ndarray, quiet=False):
model_scores = []
human_scores = []
-        #print(self.word_pair_similarity_test)
for word1_idx, word2_idx, human_score in tqdm.tqdm(self.word_pair_similarity_test, desc="Evaluating WordSim353 test", disable=quiet):
# get vector representations
word_vector_1 = embeddings[int(word1_idx)]
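The WordSim353 loop above collects one model score and one human score per word pair, but the rest of the method is cut off in this hunk. A minimal sketch of how such scores are usually produced and compared, assuming cosine similarity for the model score and a Spearman rank correlation (neither is shown in the diff):

import numpy as np
from scipy import stats

def cosine_similarity(u: np.ndarray, v: np.ndarray) -> float:
    # model score for a word pair, computed from the two embedding vectors
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

model_scores = [0.92, 0.10, 0.55]   # e.g. cosine similarities collected in the loop above
human_scores = [9.0, 1.5, 6.0]      # WordSim353-style judgements on a 0-10 scale
correlation, _ = stats.spearmanr(model_scores, human_scores)
print(correlation)  # 1.0 here: the model ranks the pairs exactly as the annotators do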
@@ -385,7 +384,7 @@ class DataLoaderCooccurrence:
def __init__(self, batch_size: int):
self._token_ids = None
self._cooccurr_counts = None

self._num_samples = 0
self._batch_size = batch_size

@@ -415,15 +414,14 @@ def build(self, words: list[str], vocabulary: Vocabulary, window_size: int, devi
if context_word_idx == vocabulary.unknown_index or context_word_idx == vocabulary.padding_index:
continue
cooccurrence_matrix[word_idx, context_word_idx] += 1.0

cooccurrence_matrix = cooccurrence_matrix.tocoo()
utils.save_npz(cooccurrence_matrix_filepath, cooccurrence_matrix)

self._token_ids = torch.tensor(np.array(list(zip(cooccurrence_matrix.row, cooccurrence_matrix.col))), dtype=torch.long, device=device)
self._cooccurr_counts = torch.tensor(cooccurrence_matrix.data, dtype=torch.float32, device=device)
self._num_samples = len(self._token_ids)


def __iter__(self):
for start in range(0, self._num_samples, self._batch_size):
end = min(start + self._batch_size, self._num_samples)
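For context, the build step above fills a sparse co-occurrence matrix, converts it to COO form, and keeps the (center, context) index pairs and their counts as tensors that __iter__ then slices into mini-batches. A minimal self-contained sketch of that pattern, assuming a scipy.sparse accumulator and leaving out the caching and the unknown/padding filtering shown in the diff (the function names here are illustrative):

import numpy as np
import scipy.sparse
import torch

def build_cooccurrence(token_ids: list[int], vocab_size: int, window_size: int, device: str):
    # accumulate counts in a sparse matrix that supports item assignment
    matrix = scipy.sparse.lil_matrix((vocab_size, vocab_size), dtype=np.float32)
    for i, word_idx in enumerate(token_ids):
        for j in range(max(0, i - window_size), min(len(token_ids), i + window_size + 1)):
            if j != i:
                matrix[word_idx, token_ids[j]] += 1.0
    coo = matrix.tocoo()
    # one (center, context) index pair per non-zero entry, plus its count
    token_pairs = torch.tensor(np.array(list(zip(coo.row, coo.col))), dtype=torch.long, device=device)
    counts = torch.tensor(coo.data, dtype=torch.float32, device=device)
    return token_pairs, counts

def iter_batches(token_pairs: torch.Tensor, counts: torch.Tensor, batch_size: int):
    # mirrors the __iter__ shown above: contiguous slices of the non-zero entries
    for start in range(0, len(token_pairs), batch_size):
        end = min(start + batch_size, len(token_pairs))
        yield token_pairs[start:end], counts[start:end]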
102 changes: 51 additions & 51 deletions source/models.py
@@ -12,14 +12,14 @@

class ModelGloVe(torch.nn.Module):
def __init__(
self,
device: str,
vocabulary_size: int,
embedding_size: int,
x_max: float,
alpha: float,
padding_idx: int = None
):
self,
device: str,
vocabulary_size: int,
embedding_size: int,
x_max: float,
alpha: float,
padding_idx: int = None
):
super().__init__()
self.device = device
self.filepath = os.path.join(PROJECT_DIRECTORY_PATH, "data", "glove", "model.pt")
@@ -77,21 +77,21 @@ def forward(self, word_index: torch.Tensor, context_index: torch.Tensor, cooccur
x_scaled = (cooccurrence_count / self.x_max).pow(self.alpha)
weighted_error = (x_scaled.clamp(0, 1) * (prediction - cooccurrence_count.log()) ** 2).mean()
return weighted_error
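
The weighted error above is the GloVe objective: each squared difference between the prediction and log X_ij is scaled by f(X_ij) = min(1, (X_ij / x_max) ** alpha), which the clamp(0, 1) implements. A small sketch of just that weighting term, using the x_max = 100 and alpha = 0.75 values from the original GloVe paper rather than this repo's configuration:

import torch

def glove_weight(cooccurrence_count: torch.Tensor, x_max: float = 100.0, alpha: float = 0.75) -> torch.Tensor:
    # f(x) = min(1, (x / x_max) ** alpha): rare pairs contribute little, frequent pairs saturate at 1
    return (cooccurrence_count / x_max).pow(alpha).clamp(0, 1)

counts = torch.tensor([1.0, 10.0, 100.0, 1000.0])
print(glove_weight(counts))  # tensor([0.0316, 0.1778, 1.0000, 1.0000])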

def validate(self, validation_dataloader: datahandler.loaders.ValidationLoader) -> float:
validation_dataloader.evaluate_analogies(self.get_embeddings(), quiet=True)
return validation_dataloader.analogies_accuracy()

def fit(
self,
training_dataloader: datahandler.loaders.DataLoaderCooccurrence,
validation_dataloader: datahandler.loaders.ValidationLoader,
learning_rate: float,
max_epochs: int,
min_loss_improvement: float,
patience: int,
validation_interval: int
):
self,
training_dataloader: datahandler.loaders.DataLoaderCooccurrence,
validation_dataloader: datahandler.loaders.ValidationLoader,
learning_rate: float,
max_epochs: int,
min_loss_improvement: float,
patience: int,
validation_interval: int
):
# check if cache exists
if os.path.exists(self.filepath):
progress_bar = tqdm.tqdm(desc="Loading cached model", total=1)
@@ -170,12 +170,12 @@ def fit(

class ModelCBOW(torch.nn.Module):
def __init__(
self,
device: str,
vocabulary_size: int,
embedding_size: int,
padding_idx: int = None
):
self,
device: str,
vocabulary_size: int,
embedding_size: int,
padding_idx: int = None
):
super().__init__()
self.device = device
self.filepath = os.path.join(PROJECT_DIRECTORY_PATH, "data", "cbow", "model.pt")
@@ -224,21 +224,21 @@ def forward(self, context_words_idx):
context_vector = torch.sum(self.input_embeddings(context_words_idx), dim=1)
output = torch.matmul(context_vector, self.output_embeddings.weight.t())
return torch.nn.functional.log_softmax(output, dim=1)
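
The CBOW forward above sums the context-word embeddings over the window dimension and scores every vocabulary word with one matrix product against the output embeddings. A shape-level sketch under assumed sizes (batch of 2, four context ids per example, vocabulary of 10, embedding size 8; these numbers are illustrative, not the repo's settings):

import torch

vocab_size, embedding_size = 10, 8
input_embeddings = torch.nn.Embedding(vocab_size, embedding_size)
output_embeddings = torch.nn.Embedding(vocab_size, embedding_size)

context_words_idx = torch.randint(0, vocab_size, (2, 4))                 # (batch, window)
context_vector = torch.sum(input_embeddings(context_words_idx), dim=1)   # (batch, embedding_size)
output = torch.matmul(context_vector, output_embeddings.weight.t())      # (batch, vocab_size)
log_probs = torch.nn.functional.log_softmax(output, dim=1)
print(log_probs.shape)  # torch.Size([2, 10]); pairs with a negative log-likelihood loss over the centre word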

def validate(self, validation_dataloader: datahandler.loaders.ValidationLoader) -> float:
validation_dataloader.evaluate_analogies(self.get_embeddings(), quiet=True)
return validation_dataloader.analogies_accuracy()

def fit(
self,
training_dataloader: datahandler.loaders.DataLoaderCBOW,
validation_dataloader: datahandler.loaders.ValidationLoader,
learning_rate: float,
max_epochs: int,
min_loss_improvement: float,
patience: int,
validation_interval: int
):
self,
training_dataloader: datahandler.loaders.DataLoaderCBOW,
validation_dataloader: datahandler.loaders.ValidationLoader,
learning_rate: float,
max_epochs: int,
min_loss_improvement: float,
patience: int,
validation_interval: int
):
# check if cache exists
if os.path.exists(self.filepath):
progress_bar = tqdm.tqdm(desc="Loading cached model", total=1)
@@ -315,12 +315,12 @@ def fit(

class ModelSkipGram(torch.nn.Module):
def __init__(
self,
device: str,
vocabulary_size: int,
embedding_size: int,
padding_idx: int = None
):
self,
device: str,
vocabulary_size: int,
embedding_size: int,
padding_idx: int = None
):
super().__init__()
self.device = device
self.filepath = os.path.join(PROJECT_DIRECTORY_PATH, "data", "skipgram", "model.pt")
@@ -369,21 +369,21 @@ def forward(self, target_words):
target_vector = self.input_embeddings(target_words)
output = torch.matmul(target_vector, self.output_embeddings.weight.t())
return torch.nn.functional.log_softmax(output, dim=1)

def validate(self, validation_dataloader: datahandler.loaders.ValidationLoader) -> float:
validation_dataloader.evaluate_analogies(self.get_embeddings(), quiet=True)
return validation_dataloader.analogies_accuracy()
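
validate() above scores the current embeddings on word analogies through the ValidationLoader. A minimal sketch of that kind of check, assuming the usual 3CosAdd formulation ("a is to b as c is to ?" answered by the word closest to b - a + c in cosine similarity); the helper below is illustrative, not the loader's actual implementation:

import numpy as np

def analogy_rank(embeddings: np.ndarray, a: int, b: int, c: int, expected: int) -> int:
    query = embeddings[b] - embeddings[a] + embeddings[c]
    # cosine similarity of the query against every word vector
    normed = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    similarities = normed @ (query / np.linalg.norm(query))
    similarities[[a, b, c]] = -np.inf  # never return one of the query words
    order = np.argsort(-similarities)
    return int(np.where(order == expected)[0][0])  # 0 means the expected word is the top hit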

def fit(
self,
training_dataloader: datahandler.loaders.DataLoaderCBOW,
validation_dataloader: datahandler.loaders.ValidationLoader,
learning_rate: float,
max_epochs: int,
min_loss_improvement: float,
patience: int,
validation_interval: int
):
self,
training_dataloader: datahandler.loaders.DataLoaderCBOW,
validation_dataloader: datahandler.loaders.ValidationLoader,
learning_rate: float,
max_epochs: int,
min_loss_improvement: float,
patience: int,
validation_interval: int
):
# check if cache exists
if os.path.exists(self.filepath):
progress_bar = tqdm.tqdm(desc="Loading cached model", total=1)
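The fit() methods above all take max_epochs, min_loss_improvement, patience, and validation_interval, which points to early stopping on the training loss with periodic analogy validation. The loop below is a hedged reconstruction of how those parameters typically interact, not the repo's actual training loop:

def fit_with_early_stopping(train_one_epoch, validate, max_epochs: int,
                            min_loss_improvement: float, patience: int,
                            validation_interval: int) -> float:
    # train_one_epoch() -> epoch loss; validate() -> analogy accuracy (both passed in as callables)
    best_loss = float("inf")
    epochs_without_improvement = 0
    for epoch in range(max_epochs):
        train_loss = train_one_epoch()
        if best_loss - train_loss > min_loss_improvement:
            best_loss, epochs_without_improvement = train_loss, 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                break  # loss has not improved meaningfully for `patience` epochs
        if (epoch + 1) % validation_interval == 0:
            accuracy = validate()  # e.g. model.validate(validation_dataloader) as defined above
    return best_loss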
20 changes: 10 additions & 10 deletions source/utils.py
@@ -74,12 +74,12 @@ def download_file(url: str, save_path: str):
response = requests.get(url, allow_redirects=True)
# ensure the request was successful
response.raise_for_status()

with open(save_path, 'wb') as file:
file.write(response.content)


-def normalize(x: np.ndarray, axis = None, keepdims = False) -> np.ndarray:
+def normalize(x: np.ndarray, axis=None, keepdims=False) -> np.ndarray:
return x / np.linalg.norm(x, axis=axis, keepdims=keepdims)
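
normalize() divides by the vector norm along the requested axis; with axis=1 and keepdims=True it turns an embedding matrix into unit-length rows, the usual preparation for cosine-similarity lookups. A small usage sketch (the array values are made up):

import numpy as np

def normalize(x: np.ndarray, axis=None, keepdims=False) -> np.ndarray:
    return x / np.linalg.norm(x, axis=axis, keepdims=keepdims)

embeddings = np.array([[3.0, 4.0], [1.0, 0.0]])
unit_rows = normalize(embeddings, axis=1, keepdims=True)
print(unit_rows)                          # [[0.6 0.8] [1.  0. ]]
print(np.linalg.norm(unit_rows, axis=1))  # [1. 1.]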


@@ -106,12 +106,12 @@ def get_model_progressbar(iter, epoch: int, max_epochs: int) -> tqdm.tqdm:


def set_model_progressbar_prefix(
progressbar: tqdm.tqdm,
train_loss: float = 0.0,
best_loss: float = 0.0,
train_acc: float = 0.0,
best_acc: float = 0.0
):
progressbar: tqdm.tqdm,
train_loss: float = 0.0,
best_loss: float = 0.0,
train_acc: float = 0.0,
best_acc: float = 0.0
):
"""
Set prefix in progressbar and update output.
"""
Expand Down Expand Up @@ -142,7 +142,7 @@ def plot_loss_and_accuracy(loss_history: list[float], accuracy_history: list[flo
ax2.tick_params(axis='y', labelcolor="blue")
# combine legends
lines = [line1, line2]
-    labels = [l.get_label() for l in lines]
+    labels = [line.get_label() for line in lines]
ax1.legend(lines, labels, loc="upper left")

fig.tight_layout(pad=3.0)
Expand Down Expand Up @@ -170,7 +170,7 @@ def plot_frequency_distribution(corpus, data_directory: str):
# check if it already exists
title = "Word Frequencies in Descending Order"
filepath = os.path.join(PROJECT_DIRECTORY_PATH, "data", data_directory, "plots", f"{title}.png")

word_freq = collections.Counter(corpus)
word_freq = sorted(word_freq.values(), reverse=True)
ranks = np.arange(1, len(word_freq) + 1)
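plot_frequency_distribution above counts tokens with collections.Counter, sorts the counts in descending order, and plots them against rank. A minimal sketch of that pattern; the log-log axes are an added assumption (the usual way to see the Zipf-like shape), since the diff does not show the repo's plot settings:

import collections
import numpy as np
import matplotlib.pyplot as plt

corpus = ["the", "cat", "sat", "on", "the", "mat", "the", "cat"]
word_freq = collections.Counter(corpus)
frequencies = sorted(word_freq.values(), reverse=True)
ranks = np.arange(1, len(frequencies) + 1)

plt.loglog(ranks, frequencies, marker="o")  # log-log axes are assumed, not taken from the diff
plt.xlabel("Rank")
plt.ylabel("Frequency")
plt.title("Word Frequencies in Descending Order")
plt.savefig("word_frequencies.png")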
