From 5219fe7ca39788cee3e4087176329bbada241da3 Mon Sep 17 00:00:00 2001 From: Thilina Rajapakse Date: Mon, 18 Dec 2023 16:25:57 +0100 Subject: [PATCH] Bump version --- examples/t5/mixed_tasks/data_prep.ipynb | 95 +++++++++++++------ examples/t5/mt5/data_prep.ipynb | 95 +++++++++++++------ setup.py | 3 +- .../classification/classification_model.py | 9 +- .../classification/classification_utils.py | 1 - .../multi_label_classification_model.py | 1 - .../multi_modal_classification_model.py | 1 - .../transformer_models/albert_model.py | 1 - .../transformer_models/bert_model.py | 1 - .../transformer_models/electra_model.py | 1 - .../transformer_models/layoutlm_model.py | 1 - .../transformer_models/mmbt_model.py | 1 - simpletransformers/conv_ai/conv_ai_model.py | 1 - simpletransformers/custom_models/models.py | 4 - .../transformer_models/albert_model.py | 1 - .../transformer_models/bert_model.py | 1 - .../language_generation_model.py | 2 - .../language_modeling_model.py | 1 - .../representation_model.py | 1 - simpletransformers/losses/dice_loss.py | 1 - simpletransformers/losses/tversky_loss.py | 1 - simpletransformers/ner/ner_model.py | 5 +- .../question_answering_model.py | 1 - .../question_answering_utils.py | 42 ++++---- simpletransformers/seq2seq/seq2seq_model.py | 19 ++-- simpletransformers/t5/t5_model.py | 10 +- simpletransformers/t5/t5_utils.py | 2 +- 27 files changed, 176 insertions(+), 126 deletions(-) diff --git a/examples/t5/mixed_tasks/data_prep.ipynb b/examples/t5/mixed_tasks/data_prep.ipynb index 31bf536e..8ece0cce 100644 --- a/examples/t5/mixed_tasks/data_prep.ipynb +++ b/examples/t5/mixed_tasks/data_prep.ipynb @@ -52,30 +52,34 @@ } ], "source": [ - "prefix = 'data/binary_classification/'\n", + "prefix = \"data/binary_classification/\"\n", "\n", - "binary_train_df = pd.read_csv(prefix + 'train.csv', header=None)\n", + "binary_train_df = pd.read_csv(prefix + \"train.csv\", header=None)\n", "binary_train_df.head()\n", "\n", - "binary_eval_df = pd.read_csv(prefix + 'test.csv', header=None)\n", + "binary_eval_df = pd.read_csv(prefix + \"test.csv\", header=None)\n", "binary_eval_df.head()\n", "\n", "binary_train_df[0] = (binary_train_df[0] == 2).astype(int)\n", "binary_eval_df[0] = (binary_eval_df[0] == 2).astype(int)\n", "\n", - "binary_train_df = pd.DataFrame({\n", - " 'prefix': [\"binary classification\" for i in range(len(binary_train_df))],\n", - " 'input_text': binary_train_df[1].str.replace('\\n', ' '),\n", - " 'target_text': binary_train_df[0].astype(str),\n", - "})\n", + "binary_train_df = pd.DataFrame(\n", + " {\n", + " \"prefix\": [\"binary classification\" for i in range(len(binary_train_df))],\n", + " \"input_text\": binary_train_df[1].str.replace(\"\\n\", \" \"),\n", + " \"target_text\": binary_train_df[0].astype(str),\n", + " }\n", + ")\n", "\n", "print(binary_train_df.head())\n", "\n", - "binary_eval_df = pd.DataFrame({\n", - " 'prefix': [\"binary classification\" for i in range(len(binary_eval_df))],\n", - " 'input_text': binary_eval_df[1].str.replace('\\n', ' '),\n", - " 'target_text': binary_eval_df[0].astype(str),\n", - "})\n", + "binary_eval_df = pd.DataFrame(\n", + " {\n", + " \"prefix\": [\"binary classification\" for i in range(len(binary_eval_df))],\n", + " \"input_text\": binary_eval_df[1].str.replace(\"\\n\", \" \"),\n", + " \"target_text\": binary_eval_df[0].astype(str),\n", + " }\n", + ")\n", "\n", "\n", "print(binary_eval_df.head())" @@ -171,16 +175,29 @@ "source": [ "prefix = \"data/multilabel_classification/\"\n", "\n", - "multi_train_df = 
pd.read_csv(prefix + 'train.csv')\n", - "multi_train_df[\"comment_text\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n", + "multi_train_df = pd.read_csv(prefix + \"train.csv\")\n", + "multi_train_df[\"comment_text\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", "\n", "for col in multi_train_df.columns:\n", " if col not in [\"id\", \"comment_text\"]:\n", " multi_train_df[col] = multi_train_df[col].apply(lambda x: col if x else \"\")\n", "\n", - "multi_train_df[\"target_text\"] = multi_train_df['toxic'].str.cat(multi_train_df[[col for col in multi_train_df.columns if col not in [\"id\", \"comment_text\", \"toxic\"]]], sep=',')\n", - "multi_train_df[\"target_text\"] = multi_train_df[\"target_text\"].apply(lambda x: \",\".join(word for word in x.split(\",\") if word)).apply(lambda x: x if x else \"clean\")\n", - "multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace('\\n', ' ')\n", + "multi_train_df[\"target_text\"] = multi_train_df[\"toxic\"].str.cat(\n", + " multi_train_df[\n", + " [\n", + " col\n", + " for col in multi_train_df.columns\n", + " if col not in [\"id\", \"comment_text\", \"toxic\"]\n", + " ]\n", + " ],\n", + " sep=\",\",\n", + ")\n", + "multi_train_df[\"target_text\"] = (\n", + " multi_train_df[\"target_text\"]\n", + " .apply(lambda x: \",\".join(word for word in x.split(\",\") if word))\n", + " .apply(lambda x: x if x else \"clean\")\n", + ")\n", + "multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace(\"\\n\", \" \")\n", "multi_train_df[\"prefix\"] = \"multilabel classification\"\n", "multi_train_df = multi_train_df[[\"prefix\", \"input_text\", \"target_text\"]]\n", "\n", @@ -206,15 +223,25 @@ } ], "source": [ - "prefix = 'data/regression/'\n", + "prefix = \"data/regression/\"\n", "\n", - "sts_train_df = pd.read_csv(prefix + 'train.tsv', sep='\\t', error_bad_lines=False).dropna()\n", - "sts_eval_df = pd.read_csv(prefix + 'dev.tsv', sep='\\t', error_bad_lines=False).dropna()\n", + "sts_train_df = pd.read_csv(\n", + " prefix + \"train.tsv\", sep=\"\\t\", error_bad_lines=False\n", + ").dropna()\n", + "sts_eval_df = pd.read_csv(prefix + \"dev.tsv\", sep=\"\\t\", error_bad_lines=False).dropna()\n", "\n", - "sts_train_df[\"sentence1\"] = sts_train_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n", - "sts_train_df[\"sentence2\"] = sts_train_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n", - "sts_eval_df[\"sentence1\"] = sts_eval_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n", - "sts_eval_df[\"sentence2\"] = sts_eval_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')" + "sts_train_df[\"sentence1\"] = (\n", + " sts_train_df[\"sentence1\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", + ")\n", + "sts_train_df[\"sentence2\"] = (\n", + " sts_train_df[\"sentence2\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", + ")\n", + "sts_eval_df[\"sentence1\"] = (\n", + " sts_eval_df[\"sentence1\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", + ")\n", + "sts_eval_df[\"sentence2\"] = (\n", + " sts_eval_df[\"sentence2\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", + ")" ] }, { @@ -223,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid." + "sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid." 
] }, { @@ -232,11 +259,19 @@ "metadata": {}, "outputs": [], "source": [ - "sts_train_df[\"input_text\"] = sts_train_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n", - "sts_eval_df[\"input_text\"] = sts_eval_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n", + "sts_train_df[\"input_text\"] = sts_train_df.apply(\n", + " lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1\n", + ")\n", + "sts_eval_df[\"input_text\"] = sts_eval_df.apply(\n", + " lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1\n", + ")\n", "\n", - "sts_train_df[\"target_text\"] = sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n", - "sts_eval_df[\"target_text\"] = sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n", + "sts_train_df[\"target_text\"] = (\n", + " sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n", + ")\n", + "sts_eval_df[\"target_text\"] = (\n", + " sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n", + ")\n", "\n", "sts_train_df[\"prefix\"] = \"similarity\"\n", "sts_eval_df[\"prefix\"] = \"similarity\"\n", diff --git a/examples/t5/mt5/data_prep.ipynb b/examples/t5/mt5/data_prep.ipynb index 35b3d4fd..a069b747 100644 --- a/examples/t5/mt5/data_prep.ipynb +++ b/examples/t5/mt5/data_prep.ipynb @@ -37,30 +37,34 @@ } ], "source": [ - "prefix = 'data/binary_classification/'\n", + "prefix = \"data/binary_classification/\"\n", "\n", - "binary_train_df = pd.read_csv(prefix + 'train.csv', header=None)\n", + "binary_train_df = pd.read_csv(prefix + \"train.csv\", header=None)\n", "binary_train_df.head()\n", "\n", - "binary_eval_df = pd.read_csv(prefix + 'test.csv', header=None)\n", + "binary_eval_df = pd.read_csv(prefix + \"test.csv\", header=None)\n", "binary_eval_df.head()\n", "\n", "binary_train_df[0] = (binary_train_df[0] == 2).astype(int)\n", "binary_eval_df[0] = (binary_eval_df[0] == 2).astype(int)\n", "\n", - "binary_train_df = pd.DataFrame({\n", - " 'prefix': [\"binary classification\" for i in range(len(binary_train_df))],\n", - " 'input_text': binary_train_df[1].str.replace('\\n', ' '),\n", - " 'target_text': binary_train_df[0].astype(str),\n", - "})\n", + "binary_train_df = pd.DataFrame(\n", + " {\n", + " \"prefix\": [\"binary classification\" for i in range(len(binary_train_df))],\n", + " \"input_text\": binary_train_df[1].str.replace(\"\\n\", \" \"),\n", + " \"target_text\": binary_train_df[0].astype(str),\n", + " }\n", + ")\n", "\n", "print(binary_train_df.head())\n", "\n", - "binary_eval_df = pd.DataFrame({\n", - " 'prefix': [\"binary classification\" for i in range(len(binary_eval_df))],\n", - " 'input_text': binary_eval_df[1].str.replace('\\n', ' '),\n", - " 'target_text': binary_eval_df[0].astype(str),\n", - "})\n", + "binary_eval_df = pd.DataFrame(\n", + " {\n", + " \"prefix\": [\"binary classification\" for i in range(len(binary_eval_df))],\n", + " \"input_text\": binary_eval_df[1].str.replace(\"\\n\", \" \"),\n", + " \"target_text\": binary_eval_df[0].astype(str),\n", + " }\n", + ")\n", "\n", "\n", "print(binary_eval_df.head())" @@ -156,16 +160,29 @@ "source": [ "prefix = \"data/multilabel_classification/\"\n", "\n", - "multi_train_df = pd.read_csv(prefix + 'train.csv')\n", - "multi_train_df[\"comment_text\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n", + "multi_train_df = pd.read_csv(prefix + \"train.csv\")\n", + 
"multi_train_df[\"comment_text\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", "\n", "for col in multi_train_df.columns:\n", " if col not in [\"id\", \"comment_text\"]:\n", " multi_train_df[col] = multi_train_df[col].apply(lambda x: col if x else \"\")\n", "\n", - "multi_train_df[\"target_text\"] = multi_train_df['toxic'].str.cat(multi_train_df[[col for col in multi_train_df.columns if col not in [\"id\", \"comment_text\", \"toxic\"]]], sep=',')\n", - "multi_train_df[\"target_text\"] = multi_train_df[\"target_text\"].apply(lambda x: \",\".join(word for word in x.split(\",\") if word)).apply(lambda x: x if x else \"clean\")\n", - "multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace('\\n', ' ')\n", + "multi_train_df[\"target_text\"] = multi_train_df[\"toxic\"].str.cat(\n", + " multi_train_df[\n", + " [\n", + " col\n", + " for col in multi_train_df.columns\n", + " if col not in [\"id\", \"comment_text\", \"toxic\"]\n", + " ]\n", + " ],\n", + " sep=\",\",\n", + ")\n", + "multi_train_df[\"target_text\"] = (\n", + " multi_train_df[\"target_text\"]\n", + " .apply(lambda x: \",\".join(word for word in x.split(\",\") if word))\n", + " .apply(lambda x: x if x else \"clean\")\n", + ")\n", + "multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace(\"\\n\", \" \")\n", "multi_train_df[\"prefix\"] = \"multilabel classification\"\n", "multi_train_df = multi_train_df[[\"prefix\", \"input_text\", \"target_text\"]]\n", "\n", @@ -191,15 +208,25 @@ } ], "source": [ - "prefix = 'data/regression/'\n", + "prefix = \"data/regression/\"\n", "\n", - "sts_train_df = pd.read_csv(prefix + 'train.tsv', sep='\\t', error_bad_lines=False).dropna()\n", - "sts_eval_df = pd.read_csv(prefix + 'dev.tsv', sep='\\t', error_bad_lines=False).dropna()\n", + "sts_train_df = pd.read_csv(\n", + " prefix + \"train.tsv\", sep=\"\\t\", error_bad_lines=False\n", + ").dropna()\n", + "sts_eval_df = pd.read_csv(prefix + \"dev.tsv\", sep=\"\\t\", error_bad_lines=False).dropna()\n", "\n", - "sts_train_df[\"sentence1\"] = sts_train_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n", - "sts_train_df[\"sentence2\"] = sts_train_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n", - "sts_eval_df[\"sentence1\"] = sts_eval_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n", - "sts_eval_df[\"sentence2\"] = sts_eval_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')" + "sts_train_df[\"sentence1\"] = (\n", + " sts_train_df[\"sentence1\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", + ")\n", + "sts_train_df[\"sentence2\"] = (\n", + " sts_train_df[\"sentence2\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", + ")\n", + "sts_eval_df[\"sentence1\"] = (\n", + " sts_eval_df[\"sentence1\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", + ")\n", + "sts_eval_df[\"sentence2\"] = (\n", + " sts_eval_df[\"sentence2\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n", + ")" ] }, { @@ -208,7 +235,7 @@ "metadata": {}, "outputs": [], "source": [ - "sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid." + "sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid." 
] }, { @@ -217,11 +244,19 @@ "metadata": {}, "outputs": [], "source": [ - "sts_train_df[\"input_text\"] = sts_train_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n", - "sts_eval_df[\"input_text\"] = sts_eval_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n", + "sts_train_df[\"input_text\"] = sts_train_df.apply(\n", + " lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1\n", + ")\n", + "sts_eval_df[\"input_text\"] = sts_eval_df.apply(\n", + " lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1\n", + ")\n", "\n", - "sts_train_df[\"target_text\"] = sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n", - "sts_eval_df[\"target_text\"] = sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n", + "sts_train_df[\"target_text\"] = (\n", + " sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n", + ")\n", + "sts_eval_df[\"target_text\"] = (\n", + " sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n", + ")\n", "\n", "sts_train_df[\"prefix\"] = \"similarity\"\n", "sts_eval_df[\"prefix\"] = \"similarity\"\n", diff --git a/setup.py b/setup.py index f35ab13f..c996665a 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="simpletransformers", - version="0.64.3", + version="0.64.5", author="Thilina Rajapakse", author_email="chaturangarajapakshe@gmail.com", description="An easy-to-use wrapper library for the Transformers library.", @@ -32,6 +32,7 @@ "scikit-learn", "seqeval", "tensorboard", + "tensorboardx", "pandas", "tokenizers", "wandb>=0.10.32", diff --git a/simpletransformers/classification/classification_model.py b/simpletransformers/classification/classification_model.py index c3f3d18c..62ef9ea8 100755 --- a/simpletransformers/classification/classification_model.py +++ b/simpletransformers/classification/classification_model.py @@ -182,7 +182,6 @@ def __init__( onnx_execution_provider=None, **kwargs, ): - """ Initializes a ClassificationModel model. 
@@ -1641,10 +1640,14 @@ def evaluate( if not self.args.sliding_window: # ROC` - wandb.log({"roc": wandb.plot.roc_curve(truth, model_outputs, labels_list)}) + wandb.log( + {"roc": wandb.plot.roc_curve(truth, model_outputs, labels_list)} + ) # Precision Recall - wandb.log({"pr": wandb.plot.pr_curve(truth, model_outputs, labels_list)}) + wandb.log( + {"pr": wandb.plot.pr_curve(truth, model_outputs, labels_list)} + ) return results, model_outputs, wrong diff --git a/simpletransformers/classification/classification_utils.py b/simpletransformers/classification/classification_utils.py index f2fc22f8..74be2a6f 100755 --- a/simpletransformers/classification/classification_utils.py +++ b/simpletransformers/classification/classification_utils.py @@ -803,7 +803,6 @@ def __init__( data_type_extension=None, multi_label=False, ): - self.text_label = text_label if text_label else "text" self.labels_label = labels_label if labels_label else "labels" self.images_label = images_label if images_label else "images" diff --git a/simpletransformers/classification/multi_label_classification_model.py b/simpletransformers/classification/multi_label_classification_model.py index e067d507..f5a56135 100755 --- a/simpletransformers/classification/multi_label_classification_model.py +++ b/simpletransformers/classification/multi_label_classification_model.py @@ -89,7 +89,6 @@ def __init__( cuda_device=-1, **kwargs, ): - """ Initializes a MultiLabelClassification model. diff --git a/simpletransformers/classification/multi_modal_classification_model.py b/simpletransformers/classification/multi_modal_classification_model.py index cbe6b252..c9b51fa9 100644 --- a/simpletransformers/classification/multi_modal_classification_model.py +++ b/simpletransformers/classification/multi_modal_classification_model.py @@ -86,7 +86,6 @@ def __init__( cuda_device=-1, **kwargs, ): - """ Initializes a MultiModalClassificationModel model. 
diff --git a/simpletransformers/classification/transformer_models/albert_model.py b/simpletransformers/classification/transformer_models/albert_model.py index fe5c0aa3..d869ad39 100755 --- a/simpletransformers/classification/transformer_models/albert_model.py +++ b/simpletransformers/classification/transformer_models/albert_model.py @@ -56,7 +56,6 @@ def forward( inputs_embeds=None, labels=None, ): - outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, diff --git a/simpletransformers/classification/transformer_models/bert_model.py b/simpletransformers/classification/transformer_models/bert_model.py index c2e3b97a..bfaa4f6a 100755 --- a/simpletransformers/classification/transformer_models/bert_model.py +++ b/simpletransformers/classification/transformer_models/bert_model.py @@ -53,7 +53,6 @@ def forward( inputs_embeds=None, labels=None, ): - outputs = self.bert( input_ids, attention_mask=attention_mask, diff --git a/simpletransformers/classification/transformer_models/electra_model.py b/simpletransformers/classification/transformer_models/electra_model.py index b414b246..476640cb 100755 --- a/simpletransformers/classification/transformer_models/electra_model.py +++ b/simpletransformers/classification/transformer_models/electra_model.py @@ -55,7 +55,6 @@ def forward( inputs_embeds=None, labels=None, ): - discriminator_hidden_states = self.electra( input_ids, attention_mask, diff --git a/simpletransformers/classification/transformer_models/layoutlm_model.py b/simpletransformers/classification/transformer_models/layoutlm_model.py index c3c5ae50..b9f41286 100644 --- a/simpletransformers/classification/transformer_models/layoutlm_model.py +++ b/simpletransformers/classification/transformer_models/layoutlm_model.py @@ -27,7 +27,6 @@ def forward( inputs_embeds=None, labels=None, ): - outputs = self.bert( input_ids=input_ids, bbox=bbox, diff --git a/simpletransformers/classification/transformer_models/mmbt_model.py b/simpletransformers/classification/transformer_models/mmbt_model.py index 83e7011d..4c88d04c 100644 --- a/simpletransformers/classification/transformer_models/mmbt_model.py +++ b/simpletransformers/classification/transformer_models/mmbt_model.py @@ -55,7 +55,6 @@ def forward( inputs_embeds=None, labels=None, ): - outputs = self.mmbt( input_modal=input_modal, input_ids=input_ids, diff --git a/simpletransformers/conv_ai/conv_ai_model.py b/simpletransformers/conv_ai/conv_ai_model.py index 09e157a5..f79307c3 100644 --- a/simpletransformers/conv_ai/conv_ai_model.py +++ b/simpletransformers/conv_ai/conv_ai_model.py @@ -95,7 +95,6 @@ def __init__( cuda_device=-1, **kwargs, ): - """ Initializes a ClassificationModel model. 
diff --git a/simpletransformers/custom_models/models.py b/simpletransformers/custom_models/models.py index 324ad84a..dcdc44da 100755 --- a/simpletransformers/custom_models/models.py +++ b/simpletransformers/custom_models/models.py @@ -516,7 +516,6 @@ def forward( inputs_embeds=None, labels=None, ): - outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, @@ -791,7 +790,6 @@ def forward( inputs_embeds=None, labels=None, ): - outputs = self.electra( input_ids, attention_mask, @@ -848,7 +846,6 @@ def forward( inputs_embeds=None, labels=None, ): - outputs = self.electra( input_ids, attention_mask, @@ -903,7 +900,6 @@ def forward( start_positions=None, end_positions=None, ): - outputs = self.electra( input_ids, attention_mask, diff --git a/simpletransformers/experimental/classification/transformer_models/albert_model.py b/simpletransformers/experimental/classification/transformer_models/albert_model.py index 2f074995..578b2902 100755 --- a/simpletransformers/experimental/classification/transformer_models/albert_model.py +++ b/simpletransformers/experimental/classification/transformer_models/albert_model.py @@ -58,7 +58,6 @@ def forward( inputs_embeds=None, labels=None, ): - all_outputs = [] if self.sliding_window: # input_ids is really the list of inputs for each "sequence window" diff --git a/simpletransformers/experimental/classification/transformer_models/bert_model.py b/simpletransformers/experimental/classification/transformer_models/bert_model.py index 60f162db..b950af44 100755 --- a/simpletransformers/experimental/classification/transformer_models/bert_model.py +++ b/simpletransformers/experimental/classification/transformer_models/bert_model.py @@ -54,7 +54,6 @@ def forward( inputs_embeds=None, labels=None, ): - all_outputs = [] if self.sliding_window: # input_ids is really the list of inputs for each "sequence window" diff --git a/simpletransformers/language_generation/language_generation_model.py b/simpletransformers/language_generation/language_generation_model.py index 94a73ea3..89a9b9b1 100644 --- a/simpletransformers/language_generation/language_generation_model.py +++ b/simpletransformers/language_generation/language_generation_model.py @@ -49,7 +49,6 @@ def __init__( cuda_device=-1, **kwargs, ): - """ Initializes a LanguageGenerationModel model. @@ -147,7 +146,6 @@ def __init__( self.model.to(self.device) def generate(self, prompt=None, args=None, verbose=True): - """ Generate text using a LanguageGenerationModel diff --git a/simpletransformers/language_modeling/language_modeling_model.py b/simpletransformers/language_modeling/language_modeling_model.py index 3984f7e6..179464b6 100755 --- a/simpletransformers/language_modeling/language_modeling_model.py +++ b/simpletransformers/language_modeling/language_modeling_model.py @@ -154,7 +154,6 @@ def __init__( cuda_device=-1, **kwargs, ): - """ Initializes a LanguageModelingModel. diff --git a/simpletransformers/language_representation/representation_model.py b/simpletransformers/language_representation/representation_model.py index 10ef94dd..7d9185b4 100644 --- a/simpletransformers/language_representation/representation_model.py +++ b/simpletransformers/language_representation/representation_model.py @@ -73,7 +73,6 @@ def __init__( cuda_device=-1, **kwargs, ): - """ Initializes a RepresentationModel model. 
diff --git a/simpletransformers/losses/dice_loss.py b/simpletransformers/losses/dice_loss.py index 0a3f513c..d96227e2 100644 --- a/simpletransformers/losses/dice_loss.py +++ b/simpletransformers/losses/dice_loss.py @@ -64,7 +64,6 @@ def __init__( self.smooth: float = smooth def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - if len(input.shape) == 2: if input.shape[0] != target.shape[0]: raise ValueError( diff --git a/simpletransformers/losses/tversky_loss.py b/simpletransformers/losses/tversky_loss.py index 6658308d..8bb52e7d 100644 --- a/simpletransformers/losses/tversky_loss.py +++ b/simpletransformers/losses/tversky_loss.py @@ -70,7 +70,6 @@ def __init__( self.smooth: float = smooth def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - if len(input.shape) == 2: if input.shape[0] != target.shape[0]: raise ValueError( diff --git a/simpletransformers/ner/ner_model.py b/simpletransformers/ner/ner_model.py index e09f62cb..7ee94e2a 100755 --- a/simpletransformers/ner/ner_model.py +++ b/simpletransformers/ner/ner_model.py @@ -228,7 +228,7 @@ def __init__( LukeConfig, LukeForTokenClassification, MLukeTokenizer, - ), + ), "mobilebert": ( MobileBertConfig, MobileBertForTokenClassification, @@ -868,7 +868,6 @@ def train( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0 ): - output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step) ) @@ -1501,7 +1500,6 @@ def predict(self, to_predict, split_on_space=True): ] if self.args.onnx: - # Encode model_inputs = self.tokenizer.batch_encode_plus( to_predict, @@ -1751,7 +1749,6 @@ def predict(self, to_predict, split_on_space=True): def _convert_tokens_to_word_logits( self, input_ids, label_ids, attention_mask, logits ): - ignore_ids = [ self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token), self.tokenizer.convert_tokens_to_ids(self.tokenizer.sep_token), diff --git a/simpletransformers/question_answering/question_answering_model.py b/simpletransformers/question_answering/question_answering_model.py index 70260fcc..849b190e 100755 --- a/simpletransformers/question_answering/question_answering_model.py +++ b/simpletransformers/question_answering/question_answering_model.py @@ -118,7 +118,6 @@ class QuestionAnsweringModel: def __init__( self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1, **kwargs ): - """ Initializes a QuestionAnsweringModel model. 
diff --git a/simpletransformers/question_answering/question_answering_utils.py b/simpletransformers/question_answering/question_answering_utils.py index 778a14a1..e069920f 100755 --- a/simpletransformers/question_answering/question_answering_utils.py +++ b/simpletransformers/question_answering/question_answering_utils.py @@ -203,7 +203,7 @@ def convert_example_to_feature(example_row): tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): + for i, token in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: @@ -249,7 +249,7 @@ def convert_example_to_feature(example_row): break start_offset += min(length, doc_stride) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [] token_to_orig_map = {} token_is_max_context = {} @@ -632,8 +632,7 @@ def convert_examples_to_features( ) else: features = [] - for (example_index, example) in enumerate(tqdm(examples, disable=silent)): - + for example_index, example in enumerate(tqdm(examples, disable=silent)): # if example_index % 100 == 0: # logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg) @@ -645,7 +644,7 @@ def convert_examples_to_features( tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): + for i, token in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: @@ -691,7 +690,7 @@ def convert_examples_to_features( break start_offset += min(length, doc_stride) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [] token_to_orig_map = {} token_is_max_context = {} @@ -891,7 +890,7 @@ def _check_is_max_context(doc_spans, cur_span_index, position): # and 0 right context. 
best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -947,7 +946,7 @@ def write_predictions( all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(all_examples): + for example_index, example in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] @@ -956,7 +955,7 @@ def write_predictions( min_null_feature_index = 0 # the paragraph slice with min null score null_start_logit = 0 # the start logit at the slice with min null score null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): result = unique_id_to_result[feature.unique_id] start_indexes = _get_best_indexes(result.start_logits, n_best_size) end_indexes = _get_best_indexes(result.end_logits, n_best_size) @@ -1091,7 +1090,7 @@ def write_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] @@ -1188,14 +1187,14 @@ def write_predictions_extended( all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(all_examples): + for example_index, example in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] # keep track of the minimum score of null start+end of position 0 score_null = 1000000 # large and positive - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): result = unique_id_to_result[feature.unique_id] cur_null_score = result.cls_logits @@ -1306,7 +1305,7 @@ def write_predictions_extended( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] @@ -1363,7 +1362,6 @@ def get_best_predictions( version_2_with_negative, null_score_diff_threshold, ): - example_index_to_features = collections.defaultdict(list) for feature in all_features: example_index_to_features[feature.example_index].append(feature) @@ -1381,7 +1379,7 @@ def get_best_predictions( all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(all_examples): + for example_index, example in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] @@ -1390,7 +1388,7 @@ def get_best_predictions( min_null_feature_index = 0 # the paragraph slice with min null score null_start_logit = 0 # the start logit at the slice with min null score null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): result = unique_id_to_result[feature.unique_id] start_indexes = _get_best_indexes(result.start_logits, n_best_size) end_indexes = _get_best_indexes(result.end_logits, n_best_size) @@ -1525,7 +1523,7 @@ def get_best_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for 
i, entry in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] @@ -1599,14 +1597,14 @@ def get_best_predictions_extended( all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(all_examples): + for example_index, example in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] # keep track of the minimum score of null start+end of position 0 score_null = 1000000 # large and positive - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): result = unique_id_to_result[feature.unique_id] cur_null_score = result.cls_logits @@ -1722,7 +1720,7 @@ def get_best_predictions_extended( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] @@ -1914,7 +1912,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): + for i, c in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i @@ -1952,7 +1950,7 @@ def _strip_spaces(text): # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): + for i, tok_index in tok_ns_to_s_map.items(): tok_s_to_ns_map[tok_index] = i orig_start_position = None diff --git a/simpletransformers/seq2seq/seq2seq_model.py b/simpletransformers/seq2seq/seq2seq_model.py index a2a6209f..17a2fa3c 100644 --- a/simpletransformers/seq2seq/seq2seq_model.py +++ b/simpletransformers/seq2seq/seq2seq_model.py @@ -138,7 +138,6 @@ def __init__( cuda_device=-1, **kwargs, ): - """ Initializes a Seq2SeqModel. 
@@ -312,7 +311,7 @@ def __init__( if encoder_decoder_type in ["bart", "mbart", "mbart50", "marian"]: self.model = model_class.from_pretrained(encoder_decoder_name) - if encoder_decoder_type in ["bart", "mbart","mbart50"]: + if encoder_decoder_type in ["bart", "mbart", "mbart50"]: self.encoder_tokenizer = tokenizer_class.from_pretrained( encoder_decoder_name ) @@ -1316,7 +1315,6 @@ def predict(self, to_predict): num_return_sequences=self.args.num_return_sequences, ) elif self.args.model_type in ["mbart"]: - # tgt_lang_token = self.decoder_tokenizer._convert_token_to_id_with_added_voc( # self.args.tgt_lang # ) @@ -1324,7 +1322,9 @@ def predict(self, to_predict): outputs = self.model.generate( input_ids=input_ids, # decoder_start_token_id=tgt_lang_token, - decoder_start_token_id=self.decoder_tokenizer.lang_code_to_id[self.args.tgt_lang], + decoder_start_token_id=self.decoder_tokenizer.lang_code_to_id[ + self.args.tgt_lang + ], num_beams=self.args.num_beams, # max_length=self.args.max_length, max_new_tokens=self.args.max_length, @@ -1337,7 +1337,6 @@ def predict(self, to_predict): num_return_sequences=self.args.num_return_sequences, ) elif self.args.model_type in ["mbart50"]: - # tgt_lang_token = self.decoder_tokenizer._convert_token_to_id_with_added_voc( # self.args.tgt_lang # ) @@ -1345,7 +1344,9 @@ def predict(self, to_predict): outputs = self.model.generate( input_ids=input_ids, # decoder_start_token_id=tgt_lang_token, - decoder_start_token_id=self.decoder_tokenizer.lang_code_to_id[self.args.tgt_lang], + decoder_start_token_id=self.decoder_tokenizer.lang_code_to_id[ + self.args.tgt_lang + ], num_beams=self.args.num_beams, # max_length=self.args.max_length, max_new_tokens=self.args.max_length, @@ -1356,7 +1357,9 @@ def predict(self, to_predict): top_k=self.args.top_k, top_p=self.args.top_p, num_return_sequences=self.args.num_return_sequences, - forced_bos_token_id=self.decoder_tokenizer.lang_code_to_id[self.args.tgt_lang], + forced_bos_token_id=self.decoder_tokenizer.lang_code_to_id[ + self.args.tgt_lang + ], ) elif self.args.model_type in ["rag-token", "rag-sequence"]: outputs = self.model.generate( @@ -1659,7 +1662,7 @@ def _get_inputs_dict(self, batch): "attention_mask": source_mask.to(device), "labels": labels.to(device), } - elif self.args.model_type in ["mbart","mbart50"]: + elif self.args.model_type in ["mbart", "mbart50"]: inputs = { "input_ids": batch["input_ids"].to(device), "attention_mask": batch["attention_mask"].to(device), diff --git a/simpletransformers/t5/t5_model.py b/simpletransformers/t5/t5_model.py index 2552f1dc..62d8af2d 100644 --- a/simpletransformers/t5/t5_model.py +++ b/simpletransformers/t5/t5_model.py @@ -70,7 +70,6 @@ def __init__( cuda_device=-1, **kwargs, ): - """ Initializes a T5Model model. @@ -922,6 +921,9 @@ def eval_model( self.results.update(result) if self.args.evaluate_generated_text: + raise ValueError( + "evaluate_generated_text not implemented without use_hf_datasets." 
+ ) if self.args.preprocess_inputs: to_predict = [ prefix + ": " + input_text @@ -943,9 +945,7 @@ def eval_model( else: target_text = eval_dataset["target_text"].tolist() - result = self.compute_metrics( - target_text, preds, **kwargs - ) + result = self.compute_metrics(target_text, preds, **kwargs) self.results.update(result) if verbose: @@ -1134,7 +1134,7 @@ def _move_model_to_device(self): def _get_inputs_dict(self, batch): if self.args.use_hf_datasets: - inputs = {**batch, "labels": batch["input_ids"]} + inputs = {**batch} return {key: value.to(self.device) for key, value in inputs.items()} else: diff --git a/simpletransformers/t5/t5_utils.py b/simpletransformers/t5/t5_utils.py index 0a3feb69..42fb9908 100644 --- a/simpletransformers/t5/t5_utils.py +++ b/simpletransformers/t5/t5_utils.py @@ -84,7 +84,7 @@ def load_hf_dataset(data, tokenizer, args): batched=True, ) - dataset.set_format(type="pt", columns=["input_ids", "attention_mask"]) + dataset.set_format(type="pt", columns=["input_ids", "attention_mask", "labels"]) if isinstance(data, str): # This is not necessarily a train dataset. The datasets library insists on calling it train.