Skip to content

Commit

Permalink
Update an example notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
singletongue committed May 19, 2023
1 parent 71522e9 commit 5b65e96
Showing 1 changed file with 52 additions and 23 deletions.
75 changes: 52 additions & 23 deletions masked_lm_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3,54 +3,75 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/m-suzuki/Projects/bert-japanese/venv/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"2023-05-19 10:03:53.353302: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"import torch\n",
"from transformers import BertJapaneseTokenizer, BertForMaskedLM"
"from transformers import AutoModelForMaskedLM, AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"model_name_or_path = \"cl-tohoku/bert-base-japanese-v2\""
"model_name_or_path = \"cl-tohoku/bert-base-japanese-v3\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)"
"tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n",
"Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n",
"- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
}
],
"source": [
"model = BertForMaskedLM.from_pretrained(model_name_or_path)"
"model = AutoModelForMaskedLM.from_pretrained(model_name_or_path)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"input_ids = tokenizer.encode(f\"青葉山で{tokenizer.mask_token}の研究をしています。\", return_tensors=\"pt\")"
Expand All @@ -59,14 +80,16 @@
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 2, 21479, 2077, 889, 4, 896, 11261, 932, 873, 888,\n",
" 854, 12343, 829, 3]])\n"
"tensor([[ 2, 22033, 1872, 457, 4, 464, 12605, 500, 441, 456,\n",
" 422, 12995, 385, 3]])\n"
]
}
],
Expand All @@ -77,7 +100,9 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
Expand All @@ -94,7 +119,9 @@
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
Expand All @@ -112,17 +139,19 @@
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[CLS] 青葉 山 で 雪 の 研究 を し て い ます 。 [SEP]\n",
"[CLS] 青葉 山 で 山 の 研究 を し て い ます 。 [SEP]\n",
"[CLS] 青葉 山 で 花 の 研究 を し て い ます 。 [SEP]\n",
"[CLS] 青葉 山 で 植物 の 研究 を し て い ます 。 [SEP]\n",
"[CLS] 青葉 山 で 鳥類 の 研究 を し て い ます 。 [SEP]\n",
"[CLS] 青葉 山 で 野鳥 の 研究 を し て い ます 。 [SEP]\n",
"[CLS] 青葉 山 で 恐竜 の 研究 を し て い ます 。 [SEP]\n",
"[CLS] 青葉 山 で 昆虫 の 研究 を し て い ます 。 [SEP]\n"
"[CLS] 青葉 山 で 星 の 研究 を し て い ます 。 [SEP]\n"
]
}
],
Expand All @@ -146,7 +175,7 @@
"metadata": {
"file_extension": ".py",
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -160,7 +189,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
"version": "3.9.16"
},
"mimetype": "text/x-python",
"name": "python",
Expand Down

0 comments on commit 5b65e96

Please sign in to comment.