This repository has been archived by the owner on Feb 8, 2023. It is now read-only.

feature(Types): added Experiment type encompassing Pipelines, metrics, project_name, etc. #114

Merged 5 commits on Jul 31, 2022
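The fields of the new Experiment type can be read off the example usages further down in this diff (project_name, run_name, dataset, pipeline, preprocessing_config, metrics, train) and from the experiment.get_configs() / experiment.pipeline.id accesses in run.py. The real definition lives in type.py, which is not part of this diff; the following is only a hedged sketch of what it plausibly looks like, with the dataclass form and the loose typing being assumptions:

    from dataclasses import dataclass
    from typing import Any, List


    @dataclass
    class Experiment:
        project_name: str          # wandb project id (replaces run()'s old project_id argument)
        run_name: str              # combined with pipeline.id to name the wandb run
        dataset: Any               # table addressed by Const.input_col / Const.label_col
        pipeline: Any              # a blocks.pipeline.Pipeline; swapped per run via set_pipeline()
        preprocessing_config: Any  # a PreprocessConfig
        metrics: List[Any]         # e.g. classification_metrics + calibration_metrics
        train: bool                # True: fit and evaluate, False: evaluate only

        def get_configs(self) -> dict:
            # run.py calls experiment.get_configs() when assembling the WandbPlugin payload
            return dict(vars(self))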
1 change: 1 addition & 0 deletions environment.yml
@@ -4,6 +4,7 @@ channels:
- huggingface
dependencies:
- python=3.9
- pyarrow>=3.0
- black
- ipython
- notebook
101 changes: 75 additions & 26 deletions library/examples/hate_speech.py
@@ -23,20 +23,21 @@
transform_hatespeech_offensive_dataset,
)
from datasets.load import load_dataset
from library.evaluation import calibration_metrics, classification_metrics
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from transformers.training_args import TrainingArguments
from type import (
Experiment,
HuggingfaceConfig,
LoadOrigin,
PreprocessConfig,
RunConfig,
SKLearnConfig,
)
from utils.flatten import remove_none
from utils.flatten import flatten, remove_none

preprocess_config = PreprocessConfig(
train_size=-1,
@@ -49,7 +50,7 @@
### Models

huggingface_config = HuggingfaceConfig(
preferred_load_origin=LoadOrigin.remote,
preferred_load_origin=LoadOrigin.local,
pretrained_model="distilbert-base-uncased",
user_name="semy",
save_remote=True,
@@ -163,28 +164,25 @@ def create_nlp_huggingface_pipeline(autocorrect: bool) -> Pipeline:
)

huggingface_baseline = create_nlp_huggingface_pipeline(autocorrect=False)
nlp_sklearn = create_nlp_sklearn_pipeline(autocorrect=False)
nlp_sklearn_autocorrect = create_nlp_sklearn_pipeline(autocorrect=True)
sklearn = create_nlp_sklearn_pipeline(autocorrect=False)
sklearn_autocorrect = create_nlp_sklearn_pipeline(autocorrect=True)

nlp_sklearn_simple = create_nlp_sklearn_pipeline(autocorrect=False)
sklearn_simple = create_nlp_sklearn_pipeline(autocorrect=False)
random = Pipeline("random", input_data, [RandomModel("random")])
vader = Pipeline("vader", input_data, [VaderModel("vader")])

ensemble_pipeline = Ensemble(
"ensemble", [nlp_sklearn, nlp_sklearn_autocorrect, text_statistics_pipeline]
ensemble_all = Ensemble(
"ensemble_all-all",
[sklearn, huggingface_baseline, text_statistics_pipeline, vader],
)

ensemble_pipeline_hf = Ensemble(
"ensemble_hf_sklearn", [nlp_sklearn, huggingface_baseline]
)
ensemble_sklearn_vader = Ensemble("ensemble_sklearn_vader", [sklearn, vader])

ensemble_pipeline_hf_statistic = Ensemble(
"ensemble_hf_statistic", [text_statistics_pipeline, huggingface_baseline]
)
ensemble_sklearn_hf = Ensemble("ensemble_sklearn_hf", [sklearn, huggingface_baseline])

ensemble_pipeline_hf_statistic_sklearn = Ensemble(
"ensemble_hf_statistic_sklearn",
[nlp_sklearn, text_statistics_pipeline, huggingface_baseline],
ensemble_hf_vader = Ensemble(
"ensemble_hf_vader",
[huggingface_baseline],
)


@@ -228,31 +226,82 @@ def create_nlp_huggingface_pipeline(autocorrect: bool) -> Pipeline:
]
)

### Metrics

metrics = classification_metrics + calibration_metrics


### Run Configs

tweeteval_hate_speech_run_configs = [
RunConfig(
run_name="hate-speech-detection",
tweeteval_hate_speech_experiments = [
Experiment(
project_name="hate-speech-detection",
run_name="tweeteval",
dataset=data_tweet_eval_hate_speech[0],
pipeline=sklearn,
preprocessing_config=preprocess_config,
metrics=metrics,
train=True,
),
RunConfig(
run_name="hate-speech-detection",
Experiment(
project_name="hate-speech-detection",
run_name="tweeteval",
dataset=data_tweet_eval_hate_speech[1],
pipeline=sklearn,
preprocessing_config=preprocess_config,
metrics=metrics,
train=False,
),
]


cross_dataset_run_configs = [
RunConfig(
run_name="hate-speech-detection-cross-val",
cross_dataset_experiments = [
Experiment(
project_name="hate-speech-detection-cross-val",
run_name="merged_dataset",
dataset=data_merged_train,
pipeline=sklearn,
preprocessing_config=preprocess_config,
metrics=metrics,
train=True,
),
RunConfig(
Experiment(
project_name="hate-speech-detection-cross-val",
run_name="hatecheck",
dataset=data_hatecheck[1],
pipeline=sklearn,
preprocessing_config=preprocess_config,
metrics=metrics,
train=False,
),
]

pipelines_to_evaluate = [
sklearn,
sklearn_autocorrect,
random,
vader,
huggingface_baseline,
ensemble_all,
ensemble_hf_vader,
ensemble_sklearn_hf,
ensemble_sklearn_vader,
]


def set_pipeline(experiment: Experiment, pipeline: Pipeline) -> Experiment:
experiment.pipeline = pipeline
return experiment


all_cross_dataset_experiments = flatten(
[
[
[
set_pipeline(experiment, pipeline)
for experiment in cross_dataset_experiments
]
]
for pipeline in pipelines_to_evaluate
]
)
21 changes: 2 additions & 19 deletions library/examples/hate_speech_multi_hf.py
@@ -1,28 +1,11 @@
from copy import deepcopy

from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from transformers import TrainingArguments

from blocks.adaptors import ListOfListsToNumpy
from blocks.augmenters.spelling_autocorrect import SpellAutocorrectAugmenter
from blocks.data import DataSource
from blocks.ensemble import Ensemble
from blocks.models.huggingface import HuggingfaceModel
from blocks.models.sklearn import SKLearnModel
from blocks.pipeline import Pipeline
from blocks.transformations import (
Lemmatizer,
SKLearnTransformation,
SpacyTokenizer,
TextStatisticTransformation,
)
from configs.constants import Const
from library.evaluation import classification, classification_metrics
from type import HuggingfaceConfig, LoadOrigin, PreprocessConfig, SKLearnConfig
from transformers.training_args import TrainingArguments
from type import HuggingfaceConfig, LoadOrigin, PreprocessConfig
from utils.flatten import remove_none

preprocess_config = PreprocessConfig(
54 changes: 9 additions & 45 deletions library/examples/hate_speech_sklearn.py
@@ -1,42 +1,19 @@
from datasets.load import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from typing import Tuple, Union

from blocks.augmenters.spelling_autocorrect import SpellAutocorrectAugmenter
from blocks.data import DataSource
from blocks.ensemble import Ensemble
from blocks.models.huggingface import HuggingfaceModel
from blocks.models.sklearn import SKLearnModel
from blocks.pipeline import Pipeline
from blocks.transformations import (
Lemmatizer,
SKLearnTransformation,
SpacyTokenizer,
TextStatisticTransformation,
)
from blocks.transformations import Lemmatizer, SKLearnTransformation, SpacyTokenizer
from blocks.transformations.no_lemmatizer import NoLemmatizer
from configs.constants import Const
from data.transformation import transform_dataset
from library.evaluation import classification
from type import (
HuggingfaceConfig,
LoadOrigin,
PreprocessConfig,
RunConfig,
SKLearnConfig,
)
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from type import LoadOrigin, SKLearnConfig
from utils.flatten import remove_none

preprocess_config = PreprocessConfig(
train_size=100,
val_size=100,
test_size=100,
input_col="text",
label_col="label",
)

sklearn_config = SKLearnConfig(
force_fit=False,
save=True,
@@ -77,9 +54,9 @@ def create_nlp_sklearn_pipeline(
[
SpellAutocorrectAugmenter(fast=True) if autocorrect else None,
SpacyTokenizer(),
Lemmatizer(remove_stopwords=False)
Lemmatizer(remove_stopwords=True)
if lemmatization
else NoLemmatizer(remove_stopwords=False),
else NoLemmatizer(remove_stopwords=True),
SKLearnTransformation(
TfidfVectorizer(
max_features=tfidf_max_features,
@@ -142,16 +119,3 @@ def create_nlp_sklearn_pipeline(
sklearn_lemma_1_2_large,
],
)

hate_speech_data = transform_dataset(
load_dataset("tweet_eval", "hate"), preprocess_config
)

run_configs = [
RunConfig(
run_name="hate-speech-detection", dataset=hate_speech_data[0], train=True
),
RunConfig(
run_name="hate-speech-detection", dataset=hate_speech_data[1], train=False
),
]
61 changes: 23 additions & 38 deletions run.py
@@ -1,69 +1,54 @@
from typing import List
from typing import List, Optional

from blocks.pipeline import Pipeline
from configs.constants import Const
from library.evaluation import calibration_metrics, classification_metrics
from library.examples.hate_speech import (
cross_dataset_run_configs,
ensemble_pipeline_hf,
huggingface_baseline,
preprocess_config,
tweeteval_hate_speech_run_configs,
vader,
)
from library.examples.hate_speech import all_cross_dataset_experiments
from plugins import WandbConfig, WandbPlugin
from runner.runner import Runner
from type import Evaluators, PreprocessConfig, RunConfig
from type import Experiment


def run(
pipeline: Pipeline,
preprocess_config: PreprocessConfig,
project_id: str,
run_configs: List[RunConfig],
metrics: Evaluators,
experiments: List[Experiment],
save_remote: Optional[
bool
] = None, # If set True all models will try uploading (if configured), if set False it overwrites uploading of any models (even if configured)
remote_logging: Optional[
Contributor Author:
it looks like remote_logging was not used anywhere! did we have it wired up on latest main?

Contributor:
It is used in one place: in run.py!

        logger_plugins = (
            [
                WandbPlugin(
                    WandbConfig(
                        project_id=project_id,
                        run_name=config.run_name + "-" + pipeline.id,
                        train=True,
                    ),
                    dict(
                        run_config=config.get_configs(),
                        preprocess_config=preprocess_config.get_configs(),
                        pipeline_configs=pipeline.get_configs(),
                    ),
                )
            ]
            if config.remote_logging
            else []
        )

Contributor Author:
sorry, my mistake! I meant save_remote, not remote_logging!

bool
] = None, # Switches on and off all remote logging (eg.: wandb)
) -> None:

for config in run_configs:
for experiment in experiments:
logger_plugins = (
[
WandbPlugin(
WandbConfig(
project_id=project_id,
run_name=config.run_name + "-" + pipeline.id,
project_id=experiment.project_name,
run_name=experiment.run_name + "-" + experiment.pipeline.id,
train=True,
),
dict(
run_config=config.get_configs(),
preprocess_config=preprocess_config.get_configs(),
pipeline_configs=pipeline.get_configs(),
run_config=experiment.get_configs(),
preprocess_config=experiment.preprocessing_config.get_configs(),
pipeline_configs=experiment.pipeline.get_configs(),
),
)
]
if config.remote_logging
if remote_logging
else []
)
runner = Runner(
config,
pipeline,
data={Const.input_col: config.dataset[Const.input_col]},
labels=config.dataset[Const.label_col]
if hasattr(config.dataset, Const.label_col)
else None,
evaluators=metrics,
experiment,
data={Const.input_col: experiment.dataset[Const.input_col]},
labels=experiment.dataset[Const.label_col],
plugins=logger_plugins,
)
runner.run()


if __name__ == "__main__":

metrics = classification_metrics + calibration_metrics

run(
vader,
preprocess_config,
project_id="hate-speech-detection",
run_configs=cross_dataset_run_configs,
metrics=metrics,
all_cross_dataset_experiments,
save_remote=False,
remote_logging=True,
)
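For orientation, here is a hedged sketch of driving the new Experiment-based entry point with a single run instead of the full all_cross_dataset_experiments list. The names come from the diffs above (run(), Experiment, preprocess_config, sklearn, the metric lists) and from the tweet_eval loading code that hate_speech_sklearn.py previously contained; treat it as an illustration of the API, not as code from this PR:

    from datasets.load import load_dataset

    from data.transformation import transform_dataset
    from library.evaluation import calibration_metrics, classification_metrics
    from library.examples.hate_speech import preprocess_config, sklearn
    from run import run
    from type import Experiment

    # Load and split the tweet_eval hate-speech data the same way the examples do.
    data = transform_dataset(load_dataset("tweet_eval", "hate"), preprocess_config)

    experiments = [
        Experiment(
            project_name="hate-speech-detection",
            run_name="tweeteval",
            dataset=data[0],
            pipeline=sklearn,
            preprocessing_config=preprocess_config,
            metrics=classification_metrics + calibration_metrics,
            train=True,
        ),
    ]

    # save_remote=False blocks model uploads; remote_logging=True keeps wandb logging on.
    run(experiments, save_remote=False, remote_logging=True)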