format code with black, yapf, autopep8 and isort #107

Open · wants to merge 1 commit into master
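The diff below is a formatting-only change. The PR does not record the exact commands or configuration that were used, so the following Python sketch only illustrates one way to chain the four tools named in the title through their public APIs (isort >= 5, autopep8, black, yapf); the reformat helper, the tool order, the style and line-length settings, and the example path are assumptions for illustration, not taken from this PR.

# Illustrative sketch only: the exact invocation used in this PR is not shown.
import autopep8
import black
import isort
from yapf.yapflib.yapf_api import FormatCode


def reformat(source: str, line_length: int = 79) -> str:
    """Run the four formatters named in the PR title over a source string."""
    source = isort.code(source, line_length=line_length)  # sort imports
    source = autopep8.fix_code(source)  # fix basic PEP 8 violations
    source = black.format_str(source, mode=black.Mode(line_length=line_length))
    # FormatCode returns (formatted_code, changed) in most yapf versions;
    # handle both shapes defensively.
    result = FormatCode(source, style_config="pep8")
    source = result[0] if isinstance(result, tuple) else result
    return source


if __name__ == "__main__":
    path = "evaluation/__init__.py"  # any file touched by this PR
    with open(path, encoding="utf-8") as handle:
        print(reformat(handle.read()))

black and yapf disagree about how to wrap long calls, so whichever runs last determines the final layout; the aligned continuation lines in the hunks below look like yapf output applied after the other tools.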
26 changes: 13 additions & 13 deletions evaluation/__init__.py
@@ -1,15 +1,14 @@
 import json
 import os
 from pathlib import Path
-from typing import Callable, Tuple, Dict, Any
+from typing import Any, Callable, Dict, Tuple
 from warnings import warn

 from datasets import Dataset, load_dataset
 from transformers import GenerationConfig

 from src.grouped_sampling import ReturnOnePipeLine

-
 STAT_NAME_TO_FUNC: Tuple[Tuple[str, Callable], ...] = (
     ("mean", lambda x: x.mean()),
     ("median", lambda x: x.median()),
@@ -49,19 +48,21 @@ def get_project_name(debug: bool = __debug__) -> str:
     return "grouped-sampling-debug" if debug else "grouped-sampling-evaluation"


-def process_translation_data(
-    sub_set_name: str, debug: bool
-) -> Tuple[Dataset, Dataset, str, str]:
+def process_translation_data(sub_set_name: str,
+                             debug: bool) -> Tuple[Dataset, Dataset, str, str]:
     spited_sub_set_name = sub_set_name.split("_")
     language_code1, language_code2 = spited_sub_set_name[:2]
     if debug:
-        sub_set: Dataset = load_dataset(DATASET_NAME, sub_set_name, split="train[:2]")
+        sub_set: Dataset = load_dataset(DATASET_NAME,
+                                        sub_set_name,
+                                        split="train[:2]")
     else:
-        sub_set: Dataset = load_dataset(DATASET_NAME, sub_set_name, split="train")
+        sub_set: Dataset = load_dataset(DATASET_NAME,
+                                        sub_set_name,
+                                        split="train")

-    def rename_keys(
-        x: Dict[str, Any], input_lang_name: str, output_lang_name: str
-    ) -> Dict[str, str]:
+    def rename_keys(x: Dict[str, Any], input_lang_name: str,
+                    output_lang_name: str) -> Dict[str, str]:
         translation: Dict[str, str] = x["translation"]
         return {
             input_lang_name: translation[input_lang_name],
@@ -103,9 +104,8 @@ def create_pipeline(max_batch_size: int) -> ReturnOnePipeLine:

 def get_experiment_parameters():
     parent_folder = Path(__file__).parent
-    with open(
-        os.path.join(parent_folder, "experiment_arguments.json"), "r"
-    ) as json_file:
+    with open(os.path.join(parent_folder, "experiment_arguments.json"),
+              "r") as json_file:
         experiment_parameters = json.load(json_file)
     return experiment_parameters

55 changes: 30 additions & 25 deletions evaluation/baseline_experiment.py
@@ -6,37 +6,41 @@
 from typing import Any, Dict, List, Tuple
 from warnings import warn

-from evaluate import load, EvaluationModule
 from datasets import Dataset, get_dataset_config_names
-from transformers import TextGenerationPipeline, AutoModelForCausalLM, AutoTokenizer
+from evaluate import EvaluationModule, load
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextGenerationPipeline

-from evaluation.experiment_manager import ExperimentManager
 from evaluation import (
-    lang_code_to_name,
-    process_translation_data,
     DATASET_NAME,
     disable_progress_bars,
+    lang_code_to_name,
+    process_translation_data,
 )
+from evaluation.experiment_manager import ExperimentManager

 disable_progress_bars()

 METRIC_NAME = "bertscore"
-metric: EvaluationModule = load(
-    METRIC_NAME, cache_dir=os.path.join(os.path.dirname(__file__), "metrics", "cache")
-)
+metric: EvaluationModule = load(METRIC_NAME,
+                                cache_dir=os.path.join(
+                                    os.path.dirname(__file__), "metrics",
+                                    "cache"))


-def process_sub_set_half(
-    sub_set_half: Dataset, in_lang_code: str, out_lang_code: str
-) -> Tuple[List[str], List[str]]:
+def process_sub_set_half(sub_set_half: Dataset, in_lang_code: str,
+                         out_lang_code: str) -> Tuple[List[str], List[str]]:
     input_lang_name = lang_code_to_name(in_lang_code)
     output_lang_name = lang_code_to_name(out_lang_code)
     prefix = (
         f"Translate {input_lang_name} to {output_lang_name}: \n {input_lang_name}: "
     )
     postfix = f"\n {output_lang_name}: "
-    inputs = [prefix + x["translation"][in_lang_code] + postfix for x in sub_set_half]
-    references: List[str] = [x["translation"][out_lang_code] for x in sub_set_half]
+    inputs = [
+        prefix + x["translation"][in_lang_code] + postfix for x in sub_set_half
+    ]
+    references: List[str] = [
+        x["translation"][out_lang_code] for x in sub_set_half
+    ]
     return inputs, references


@@ -49,7 +53,8 @@ def sub_experiment_half(
 ) -> None:
     inputs: List[str]
     references: List[str]
-    inputs, references = process_sub_set_half(sub_set_half, in_lang_code, out_lang_code)
+    inputs, references = process_sub_set_half(sub_set_half, in_lang_code,
+                                              out_lang_code)
     raw_predictions: List[List[Dict[str, str]]] = pipeline(
         inputs,
         num_beams=1,
@@ -69,13 +74,12 @@ def sub_experiment_half(
         predictions=predictions,
         references=references,
     )
-    scores = metric.compute(
-        lang=out_lang_code,
-    )
+    scores = metric.compute(lang=out_lang_code, )

     # noinspection PyTypeChecker

-    manager.log_sub_experiment(scores, in_lang_code, out_lang_code, sub_set_half)
+    manager.log_sub_experiment(scores, in_lang_code, out_lang_code,
+                               sub_set_half)


 def run_experiment(
@@ -115,15 +119,14 @@ def run_experiment(


 def create_hugging_face_pipeline(
-    debug: bool,
-) -> Tuple[TextGenerationPipeline, Dict[str, Any]]:
+        debug: bool, ) -> Tuple[TextGenerationPipeline, Dict[str, Any]]:
     """Creates a translation pipeline from hugging face"""
     parent_folder = Path(__file__).parent
-    with open(
-        os.path.join(parent_folder, "experiment_arguments.json"), "r"
-    ) as json_file:
+    with open(os.path.join(parent_folder, "experiment_arguments.json"),
+              "r") as json_file:
         evaluated_text_generator_dict = json.load(json_file)
-    model_name = "gpt2" if debug else evaluated_text_generator_dict["model_name"]
+    model_name = "gpt2" if debug else evaluated_text_generator_dict[
+        "model_name"]
     model = AutoModelForCausalLM.from_pretrained(model_name)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     pipeline = TextGenerationPipeline(
@@ -154,7 +157,9 @@ def create_hugging_face_pipeline(
 def main(debug: bool = __debug__) -> None:
     if debug:
         # send a warning
-        warn("Running in debug mode, only a small subset of the data will be used")
+        warn(
+            "Running in debug mode, only a small subset of the data will be used"
+        )
     sub_sut_names = get_dataset_config_names(DATASET_NAME)
     if debug:
         sub_sut_names = sub_sut_names[:1]
24 changes: 12 additions & 12 deletions evaluation/evaluate_translation.py
@@ -1,21 +1,20 @@
 from __future__ import annotations

-
 from typing import Any, Dict, List, Union
 from warnings import warn

-from evaluate import TranslationEvaluator
 from datasets import Dataset, get_dataset_config_names
+from evaluate import TranslationEvaluator

-from evaluation.experiment_manager import ExperimentManager
 from evaluation import (
-    lang_code_to_name,
-    process_translation_data,
     DATASET_NAME,
     create_pipeline,
     disable_progress_bars,
     get_experiment_parameters,
+    lang_code_to_name,
+    process_translation_data,
 )
+from evaluation.experiment_manager import ExperimentManager
 from src.grouped_sampling import ReturnOnePipeLine

 disable_progress_bars()
@@ -30,8 +29,7 @@ def sub_experiment_half(
     manager: ExperimentManager,
 ) -> None:
     input_lang_name, output_lang_name = lang_code_to_name(
-        in_lang_code
-    ), lang_code_to_name(out_lang_code)
+        in_lang_code), lang_code_to_name(out_lang_code)
     prefix = (
         f"Translate {input_lang_name} to {output_lang_name}: \n {input_lang_name}: "
     )
@@ -45,7 +43,8 @@ def sub_experiment_half(
         input_column=in_lang_code,
         label_column=out_lang_code,
     )
-    manager.log_sub_experiment(scores, in_lang_code, out_lang_code, sub_set_half)
+    manager.log_sub_experiment(scores, in_lang_code, out_lang_code,
+                               sub_set_half)


 def run_experiment(
@@ -103,16 +102,17 @@ def create_evaluator() -> TranslationEvaluator:
 def main(debug: bool = __debug__) -> None:
     if debug:
         # send a warning
-        warn("Running in debug mode, only a small subset of the data will be used")
+        warn(
+            "Running in debug mode, only a small subset of the data will be used"
+        )
     sub_sut_names = get_dataset_config_names(DATASET_NAME)
     if debug:
         sub_sut_names = sub_sut_names[:1]
     curr_text_generator = create_pipeline(max_batch_size=32)
     curr_evaluator = create_evaluator()
     parameters = get_experiment_parameters()
-    run_experiment(
-        curr_text_generator, curr_evaluator, sub_sut_names, debug, parameters
-    )
+    run_experiment(curr_text_generator, curr_evaluator, sub_sut_names, debug,
+                   parameters)


 if __name__ == "__main__":
82 changes: 40 additions & 42 deletions evaluation/experiment_manager.py
@@ -8,13 +8,13 @@
 from datasets import Dataset
 from pandas import DataFrame, concat

+from cometml_key import get_comet_api_key
 from evaluation import (
     BERT_SCORES,
     STAT_NAME_TO_FUNC,
     get_project_name,
     lang_code_to_name,
 )
-from cometml_key import get_comet_api_key


 class ExperimentManager:
@@ -35,20 +35,17 @@ def __init__(self, debug: bool, parameters: Dict[str, Any] = None):
         )
         self.experiment.log_parameters(parameters)
         self.start_time = datetime.now()
-        self.df = DataFrame(
-            columns=[
-                "input_text",
-                "target_text",
-                "input_language",
-                "output_language",
-            ]
-            + list(BERT_SCORES)
-        )
+        self.df = DataFrame(columns=[
+            "input_text",
+            "target_text",
+            "input_language",
+            "output_language",
+        ] + list(BERT_SCORES))

     def log_stats(self, scores: DataFrame, title: str) -> None:
-        self.experiment.log_dataframe_profile(
-            scores, f"{title}_BERT_scores", header=True
-        )
+        self.experiment.log_dataframe_profile(scores,
+                                              f"{title}_BERT_scores",
+                                              header=True)
         for score_name in BERT_SCORES:
             curr_column = scores[score_name]
             score_stats = {
@@ -60,8 +57,7 @@ def log_stats(self, scores: DataFrame, title: str) -> None:
             plt.title(f"Histogram of {title}_{score_name}")
             plt.legend()
             self.experiment.log_figure(
-                figure_name=f"{title}_{score_name}_histogram", figure=plt
-            )
+                figure_name=f"{title}_{score_name}_histogram", figure=plt)

     def log_sub_experiment(
         self,
@@ -80,26 +76,28 @@ def log_sub_experiment(
             sub_set: The dataset that was used for this sub experiment half
         """
         input_lang_name, output_lang_name = lang_code_to_name(
-            input_lang_code
-        ), lang_code_to_name(output_lang_code)
+            input_lang_code), lang_code_to_name(output_lang_code)
         self.language_pairs.add((input_lang_name, output_lang_name))
         f_1: List[float] = bert_scores["f1"]
         precision: List[float] = bert_scores["precision"]
         recall: List[float] = bert_scores["recall"]
         if not len(f_1) == len(precision) == len(recall):
             raise AssertionError
         # add scores to the dataframe
-        new_data: DataFrame = DataFrame.from_dict(
-            {
-                "input_language": [input_lang_name] * len(f_1),
-                "output_language": [output_lang_name] * len(f_1),
-                "BERT_f1": f_1,
-                "BERT_precision": precision,
-                "BERT_recall": recall,
-                "input_text": sub_set[input_lang_code],
-                "target_text": sub_set[output_lang_code],
-            }
-        )
+        new_data: DataFrame = DataFrame.from_dict({
+            "input_language": [input_lang_name] * len(f_1),
+            "output_language": [output_lang_name] * len(f_1),
+            "BERT_f1":
+            f_1,
+            "BERT_precision":
+            precision,
+            "BERT_recall":
+            recall,
+            "input_text":
+            sub_set[input_lang_code],
+            "target_text":
+            sub_set[output_lang_code],
+        })
         self.df = concat([self.df, new_data], ignore_index=True, copy=False)
         curr_time_diff = datetime.now() - self.start_time
         print(f"Translated {len(self.df)} examples in {curr_time_diff}")
@@ -109,22 +107,22 @@ def end_experiment(self) -> None:
         self.log_stats(self.df, "general")
         for input_lang, output_lang in self.language_pairs:
             pair_scores: DataFrame
-            pair_scores = self.df[
-                (self.df["input_language"] == input_lang)
-                & (self.df["output_language"] == output_lang)
-            ]
+            pair_scores = self.df[(self.df["input_language"] == input_lang)
+                                  &
+                                  (self.df["output_language"] == output_lang)]
             self.log_stats(pair_scores, f"{input_lang} to {output_lang}")
-        total_time_in_seconds = (datetime.now() - self.start_time).total_seconds()
+        total_time_in_seconds = (datetime.now() -
+                                 self.start_time).total_seconds()
         num_examples = len(self.df)
         total_time_in_hours = total_time_in_seconds / 3600
-        self.experiment.log_metrics(
-            {
-                "time in hours": total_time_in_hours,
-                "seconds per example": total_time_in_seconds / num_examples,
-                "examples per second": num_examples / total_time_in_seconds,
-            }
-        )
+        self.experiment.log_metrics({
+            "time in hours":
+            total_time_in_hours,
+            "seconds per example":
+            total_time_in_seconds / num_examples,
+            "examples per second":
+            num_examples / total_time_in_seconds,
+        })
         self.experiment.send_notification(
-            f"Experiment finished successfully in {total_time_in_hours} hours"
-        )
+            f"Experiment finished successfully in {total_time_in_hours} hours")
         self.experiment.end()