Skip to content

Commit

Permalink
Bump version
Browse files Browse the repository at this point in the history
  • Loading branch information
Thilina Rajapakse committed Dec 18, 2023
1 parent 76d9801 commit 5219fe7
Show file tree
Hide file tree
Showing 27 changed files with 176 additions and 126 deletions.
95 changes: 65 additions & 30 deletions examples/t5/mixed_tasks/data_prep.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -52,30 +52,34 @@
}
],
"source": [
"prefix = 'data/binary_classification/'\n",
"prefix = \"data/binary_classification/\"\n",
"\n",
"binary_train_df = pd.read_csv(prefix + 'train.csv', header=None)\n",
"binary_train_df = pd.read_csv(prefix + \"train.csv\", header=None)\n",
"binary_train_df.head()\n",
"\n",
"binary_eval_df = pd.read_csv(prefix + 'test.csv', header=None)\n",
"binary_eval_df = pd.read_csv(prefix + \"test.csv\", header=None)\n",
"binary_eval_df.head()\n",
"\n",
"binary_train_df[0] = (binary_train_df[0] == 2).astype(int)\n",
"binary_eval_df[0] = (binary_eval_df[0] == 2).astype(int)\n",
"\n",
"binary_train_df = pd.DataFrame({\n",
" 'prefix': [\"binary classification\" for i in range(len(binary_train_df))],\n",
" 'input_text': binary_train_df[1].str.replace('\\n', ' '),\n",
" 'target_text': binary_train_df[0].astype(str),\n",
"})\n",
"binary_train_df = pd.DataFrame(\n",
" {\n",
" \"prefix\": [\"binary classification\" for i in range(len(binary_train_df))],\n",
" \"input_text\": binary_train_df[1].str.replace(\"\\n\", \" \"),\n",
" \"target_text\": binary_train_df[0].astype(str),\n",
" }\n",
")\n",
"\n",
"print(binary_train_df.head())\n",
"\n",
"binary_eval_df = pd.DataFrame({\n",
" 'prefix': [\"binary classification\" for i in range(len(binary_eval_df))],\n",
" 'input_text': binary_eval_df[1].str.replace('\\n', ' '),\n",
" 'target_text': binary_eval_df[0].astype(str),\n",
"})\n",
"binary_eval_df = pd.DataFrame(\n",
" {\n",
" \"prefix\": [\"binary classification\" for i in range(len(binary_eval_df))],\n",
" \"input_text\": binary_eval_df[1].str.replace(\"\\n\", \" \"),\n",
" \"target_text\": binary_eval_df[0].astype(str),\n",
" }\n",
")\n",
"\n",
"\n",
"print(binary_eval_df.head())"
Expand Down Expand Up @@ -171,16 +175,29 @@
"source": [
"prefix = \"data/multilabel_classification/\"\n",
"\n",
"multi_train_df = pd.read_csv(prefix + 'train.csv')\n",
"multi_train_df[\"comment_text\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
"multi_train_df = pd.read_csv(prefix + \"train.csv\")\n",
"multi_train_df[\"comment_text\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
"\n",
"for col in multi_train_df.columns:\n",
" if col not in [\"id\", \"comment_text\"]:\n",
" multi_train_df[col] = multi_train_df[col].apply(lambda x: col if x else \"\")\n",
"\n",
"multi_train_df[\"target_text\"] = multi_train_df['toxic'].str.cat(multi_train_df[[col for col in multi_train_df.columns if col not in [\"id\", \"comment_text\", \"toxic\"]]], sep=',')\n",
"multi_train_df[\"target_text\"] = multi_train_df[\"target_text\"].apply(lambda x: \",\".join(word for word in x.split(\",\") if word)).apply(lambda x: x if x else \"clean\")\n",
"multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace('\\n', ' ')\n",
"multi_train_df[\"target_text\"] = multi_train_df[\"toxic\"].str.cat(\n",
" multi_train_df[\n",
" [\n",
" col\n",
" for col in multi_train_df.columns\n",
" if col not in [\"id\", \"comment_text\", \"toxic\"]\n",
" ]\n",
" ],\n",
" sep=\",\",\n",
")\n",
"multi_train_df[\"target_text\"] = (\n",
" multi_train_df[\"target_text\"]\n",
" .apply(lambda x: \",\".join(word for word in x.split(\",\") if word))\n",
" .apply(lambda x: x if x else \"clean\")\n",
")\n",
"multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace(\"\\n\", \" \")\n",
"multi_train_df[\"prefix\"] = \"multilabel classification\"\n",
"multi_train_df = multi_train_df[[\"prefix\", \"input_text\", \"target_text\"]]\n",
"\n",
Expand All @@ -206,15 +223,25 @@
}
],
"source": [
"prefix = 'data/regression/'\n",
"prefix = \"data/regression/\"\n",
"\n",
"sts_train_df = pd.read_csv(prefix + 'train.tsv', sep='\\t', error_bad_lines=False).dropna()\n",
"sts_eval_df = pd.read_csv(prefix + 'dev.tsv', sep='\\t', error_bad_lines=False).dropna()\n",
"sts_train_df = pd.read_csv(\n",
" prefix + \"train.tsv\", sep=\"\\t\", error_bad_lines=False\n",
").dropna()\n",
"sts_eval_df = pd.read_csv(prefix + \"dev.tsv\", sep=\"\\t\", error_bad_lines=False).dropna()\n",
"\n",
"sts_train_df[\"sentence1\"] = sts_train_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
"sts_train_df[\"sentence2\"] = sts_train_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
"sts_eval_df[\"sentence1\"] = sts_eval_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
"sts_eval_df[\"sentence2\"] = sts_eval_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')"
"sts_train_df[\"sentence1\"] = (\n",
" sts_train_df[\"sentence1\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
")\n",
"sts_train_df[\"sentence2\"] = (\n",
" sts_train_df[\"sentence2\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
")\n",
"sts_eval_df[\"sentence1\"] = (\n",
" sts_eval_df[\"sentence1\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
")\n",
"sts_eval_df[\"sentence2\"] = (\n",
" sts_eval_df[\"sentence2\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
")"
]
},
{
Expand All @@ -223,7 +250,7 @@
"metadata": {},
"outputs": [],
"source": [
"sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid."
"sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid."
]
},
{
Expand All @@ -232,11 +259,19 @@
"metadata": {},
"outputs": [],
"source": [
"sts_train_df[\"input_text\"] = sts_train_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n",
"sts_eval_df[\"input_text\"] = sts_eval_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n",
"sts_train_df[\"input_text\"] = sts_train_df.apply(\n",
" lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1\n",
")\n",
"sts_eval_df[\"input_text\"] = sts_eval_df.apply(\n",
" lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1\n",
")\n",
"\n",
"sts_train_df[\"target_text\"] = sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
"sts_eval_df[\"target_text\"] = sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
"sts_train_df[\"target_text\"] = (\n",
" sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
")\n",
"sts_eval_df[\"target_text\"] = (\n",
" sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
")\n",
"\n",
"sts_train_df[\"prefix\"] = \"similarity\"\n",
"sts_eval_df[\"prefix\"] = \"similarity\"\n",
Expand Down
95 changes: 65 additions & 30 deletions examples/t5/mt5/data_prep.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -37,30 +37,34 @@
}
],
"source": [
"prefix = 'data/binary_classification/'\n",
"prefix = \"data/binary_classification/\"\n",
"\n",
"binary_train_df = pd.read_csv(prefix + 'train.csv', header=None)\n",
"binary_train_df = pd.read_csv(prefix + \"train.csv\", header=None)\n",
"binary_train_df.head()\n",
"\n",
"binary_eval_df = pd.read_csv(prefix + 'test.csv', header=None)\n",
"binary_eval_df = pd.read_csv(prefix + \"test.csv\", header=None)\n",
"binary_eval_df.head()\n",
"\n",
"binary_train_df[0] = (binary_train_df[0] == 2).astype(int)\n",
"binary_eval_df[0] = (binary_eval_df[0] == 2).astype(int)\n",
"\n",
"binary_train_df = pd.DataFrame({\n",
" 'prefix': [\"binary classification\" for i in range(len(binary_train_df))],\n",
" 'input_text': binary_train_df[1].str.replace('\\n', ' '),\n",
" 'target_text': binary_train_df[0].astype(str),\n",
"})\n",
"binary_train_df = pd.DataFrame(\n",
" {\n",
" \"prefix\": [\"binary classification\" for i in range(len(binary_train_df))],\n",
" \"input_text\": binary_train_df[1].str.replace(\"\\n\", \" \"),\n",
" \"target_text\": binary_train_df[0].astype(str),\n",
" }\n",
")\n",
"\n",
"print(binary_train_df.head())\n",
"\n",
"binary_eval_df = pd.DataFrame({\n",
" 'prefix': [\"binary classification\" for i in range(len(binary_eval_df))],\n",
" 'input_text': binary_eval_df[1].str.replace('\\n', ' '),\n",
" 'target_text': binary_eval_df[0].astype(str),\n",
"})\n",
"binary_eval_df = pd.DataFrame(\n",
" {\n",
" \"prefix\": [\"binary classification\" for i in range(len(binary_eval_df))],\n",
" \"input_text\": binary_eval_df[1].str.replace(\"\\n\", \" \"),\n",
" \"target_text\": binary_eval_df[0].astype(str),\n",
" }\n",
")\n",
"\n",
"\n",
"print(binary_eval_df.head())"
Expand Down Expand Up @@ -156,16 +160,29 @@
"source": [
"prefix = \"data/multilabel_classification/\"\n",
"\n",
"multi_train_df = pd.read_csv(prefix + 'train.csv')\n",
"multi_train_df[\"comment_text\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
"multi_train_df = pd.read_csv(prefix + \"train.csv\")\n",
"multi_train_df[\"comment_text\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
"\n",
"for col in multi_train_df.columns:\n",
" if col not in [\"id\", \"comment_text\"]:\n",
" multi_train_df[col] = multi_train_df[col].apply(lambda x: col if x else \"\")\n",
"\n",
"multi_train_df[\"target_text\"] = multi_train_df['toxic'].str.cat(multi_train_df[[col for col in multi_train_df.columns if col not in [\"id\", \"comment_text\", \"toxic\"]]], sep=',')\n",
"multi_train_df[\"target_text\"] = multi_train_df[\"target_text\"].apply(lambda x: \",\".join(word for word in x.split(\",\") if word)).apply(lambda x: x if x else \"clean\")\n",
"multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace('\\n', ' ')\n",
"multi_train_df[\"target_text\"] = multi_train_df[\"toxic\"].str.cat(\n",
" multi_train_df[\n",
" [\n",
" col\n",
" for col in multi_train_df.columns\n",
" if col not in [\"id\", \"comment_text\", \"toxic\"]\n",
" ]\n",
" ],\n",
" sep=\",\",\n",
")\n",
"multi_train_df[\"target_text\"] = (\n",
" multi_train_df[\"target_text\"]\n",
" .apply(lambda x: \",\".join(word for word in x.split(\",\") if word))\n",
" .apply(lambda x: x if x else \"clean\")\n",
")\n",
"multi_train_df[\"input_text\"] = multi_train_df[\"comment_text\"].str.replace(\"\\n\", \" \")\n",
"multi_train_df[\"prefix\"] = \"multilabel classification\"\n",
"multi_train_df = multi_train_df[[\"prefix\", \"input_text\", \"target_text\"]]\n",
"\n",
Expand All @@ -191,15 +208,25 @@
}
],
"source": [
"prefix = 'data/regression/'\n",
"prefix = \"data/regression/\"\n",
"\n",
"sts_train_df = pd.read_csv(prefix + 'train.tsv', sep='\\t', error_bad_lines=False).dropna()\n",
"sts_eval_df = pd.read_csv(prefix + 'dev.tsv', sep='\\t', error_bad_lines=False).dropna()\n",
"sts_train_df = pd.read_csv(\n",
" prefix + \"train.tsv\", sep=\"\\t\", error_bad_lines=False\n",
").dropna()\n",
"sts_eval_df = pd.read_csv(prefix + \"dev.tsv\", sep=\"\\t\", error_bad_lines=False).dropna()\n",
"\n",
"sts_train_df[\"sentence1\"] = sts_train_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
"sts_train_df[\"sentence2\"] = sts_train_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
"sts_eval_df[\"sentence1\"] = sts_eval_df[\"sentence1\"].str.replace('\\n', ' ').str.replace('\\t', ' ')\n",
"sts_eval_df[\"sentence2\"] = sts_eval_df[\"sentence2\"].str.replace('\\n', ' ').str.replace('\\t', ' ')"
"sts_train_df[\"sentence1\"] = (\n",
" sts_train_df[\"sentence1\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
")\n",
"sts_train_df[\"sentence2\"] = (\n",
" sts_train_df[\"sentence2\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
")\n",
"sts_eval_df[\"sentence1\"] = (\n",
" sts_eval_df[\"sentence1\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
")\n",
"sts_eval_df[\"sentence2\"] = (\n",
" sts_eval_df[\"sentence2\"].str.replace(\"\\n\", \" \").str.replace(\"\\t\", \" \")\n",
")"
]
},
{
Expand All @@ -208,7 +235,7 @@
"metadata": {},
"outputs": [],
"source": [
"sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid."
"sts_train_df.drop(2001, inplace=True) # This line badly formatted. Getting rid."
]
},
{
Expand All @@ -217,11 +244,19 @@
"metadata": {},
"outputs": [],
"source": [
"sts_train_df[\"input_text\"] = sts_train_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n",
"sts_eval_df[\"input_text\"] = sts_eval_df.apply(lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1)\n",
"sts_train_df[\"input_text\"] = sts_train_df.apply(\n",
" lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1\n",
")\n",
"sts_eval_df[\"input_text\"] = sts_eval_df.apply(\n",
" lambda x: \"sentence1: \" + x[\"sentence1\"] + \" sentence2: \" + x[\"sentence2\"], axis=1\n",
")\n",
"\n",
"sts_train_df[\"target_text\"] = sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
"sts_eval_df[\"target_text\"] = sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
"sts_train_df[\"target_text\"] = (\n",
" sts_train_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
")\n",
"sts_eval_df[\"target_text\"] = (\n",
" sts_eval_df[\"score\"].apply(lambda x: round(x * 5) / 5).astype(str)\n",
")\n",
"\n",
"sts_train_df[\"prefix\"] = \"similarity\"\n",
"sts_eval_df[\"prefix\"] = \"similarity\"\n",
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="simpletransformers",
version="0.64.3",
version="0.64.5",
author="Thilina Rajapakse",
author_email="[email protected]",
description="An easy-to-use wrapper library for the Transformers library.",
Expand All @@ -32,6 +32,7 @@
"scikit-learn",
"seqeval",
"tensorboard",
"tensorboardx",
"pandas",
"tokenizers",
"wandb>=0.10.32",
Expand Down
9 changes: 6 additions & 3 deletions simpletransformers/classification/classification_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ def __init__(
onnx_execution_provider=None,
**kwargs,
):

"""
Initializes a ClassificationModel model.
Expand Down Expand Up @@ -1641,10 +1640,14 @@ def evaluate(

if not self.args.sliding_window:
# ROC
wandb.log({"roc": wandb.plot.roc_curve(truth, model_outputs, labels_list)})
wandb.log(
{"roc": wandb.plot.roc_curve(truth, model_outputs, labels_list)}
)

# Precision Recall
wandb.log({"pr": wandb.plot.pr_curve(truth, model_outputs, labels_list)})
wandb.log(
{"pr": wandb.plot.pr_curve(truth, model_outputs, labels_list)}
)

return results, model_outputs, wrong

Expand Down
1 change: 0 additions & 1 deletion simpletransformers/classification/classification_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,6 @@ def __init__(
data_type_extension=None,
multi_label=False,
):

self.text_label = text_label if text_label else "text"
self.labels_label = labels_label if labels_label else "labels"
self.images_label = images_label if images_label else "images"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ def __init__(
cuda_device=-1,
**kwargs,
):

"""
Initializes a MultiLabelClassification model.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def __init__(
cuda_device=-1,
**kwargs,
):

"""
Initializes a MultiModalClassificationModel model.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ def forward(
inputs_embeds=None,
labels=None,
):

outputs = self.albert(
input_ids=input_ids,
attention_mask=attention_mask,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def forward(
inputs_embeds=None,
labels=None,
):

outputs = self.bert(
input_ids,
attention_mask=attention_mask,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ def forward(
inputs_embeds=None,
labels=None,
):

discriminator_hidden_states = self.electra(
input_ids,
attention_mask,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def forward(
inputs_embeds=None,
labels=None,
):

outputs = self.bert(
input_ids=input_ids,
bbox=bbox,
Expand Down
Loading

0 comments on commit 5219fe7

Please sign in to comment.