Skip to content

Commit

Permalink
Update tokenizers
Browse files (browse the repository at this point in the history)
  • Loading branch information
singletongue committed Oct 9, 2019
1 parent a801e58 commit 823e4e9
Showing 1 changed file with 27 additions and 174 deletions.
201 changes: 27 additions & 174 deletions masked_lm_example.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,27 +2,18 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"BERT_BASE_DIR = '/Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k'"
"BERT_BASE_DIR = '/Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"I1001 22:27:35.863064 4618630592 file_utils.py:39] PyTorch version 1.2.0 available.\n",
"I1001 22:27:36.331261 4618630592 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n"
]
}
],
"outputs": [],
"source": [
"import torch\n",
"from transformers import BertForMaskedLM\n",
Expand All @@ -31,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -40,103 +31,52 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text = '今年の冬は友達と北海道に行きました。'"
"text = '朝食に[MASK]と牛乳を食べました。'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokens = ['[CLS]'] + tokenizer.tokenize(text)"
"token_ids = tokenizer.encode(text, add_special_tokens=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['[CLS]', '今年', 'の', '冬', 'は', '友達', 'と', '北海道', 'に', '行き', 'まし', 'た', '。']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokens"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokens[7] = '[MASK]'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['[CLS]', '今年', 'の', '冬', 'は', '友達', 'と', '[MASK]', 'に', '行き', 'まし', 'た', '。']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokens"
"token_ids"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"token_ids = tokenizer.convert_tokens_to_ids(tokens)"
"tokens = tokenizer.convert_ids_to_tokens(token_ids)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[2, 18337, 5, 2558, 9, 11680, 13, 4, 7, 2563, 3926, 10, 8]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"token_ids"
"tokens"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -145,69 +85,25 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 2, 18337, 5, 2558, 9, 11680, 13, 4, 7, 2563,\n",
" 3926, 10, 8]])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"token_ids"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"I1001 22:27:42.803631 4618630592 configuration_utils.py:148] loading configuration file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/config.json\n",
"I1001 22:27:42.805089 4618630592 configuration_utils.py:168] Model config {\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"finetuning_task\": null,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-12,\n",
" \"max_position_embeddings\": 512,\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"num_labels\": 2,\n",
" \"output_attentions\": false,\n",
" \"output_hidden_states\": false,\n",
" \"pruned_heads\": {},\n",
" \"torchscript\": false,\n",
" \"type_vocab_size\": 2,\n",
" \"use_bfloat16\": false,\n",
" \"vocab_size\": 32000\n",
"}\n",
"\n",
"I1001 22:27:42.806373 4618630592 modeling_utils.py:334] loading weights file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/pytorch_model.bin\n",
"I1001 22:27:45.143396 4618630592 modeling_utils.py:408] Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n"
]
}
],
"outputs": [],
"source": [
"model = BertForMaskedLM.from_pretrained(BERT_BASE_DIR)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -216,7 +112,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -225,61 +121,18 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[[ 6, 8, 1191, 11680, 13, 14142, 10, 12944, 4733, 6115],\n",
" [18337, 18822, 2558, 1052, 1331, 1460, 4960, 1322, 19, 7246],\n",
" [ 5, 6, 28, 9, 52, 13, 40, 60, 1191, 18],\n",
" [ 2558, 1460, 1383, 1331, 1158, 72, 4587, 7885, 8211, 51],\n",
" [ 9, 28, 6, 7, 5, 12, 14966, 1191, 10590, 40],\n",
" [11680, 8080, 3681, 3713, 6296, 2286, 14066, 15884, 1052, 2569],\n",
" [ 13, 12, 7, 25350, 996, 5, 14, 4338, 11, 21693],\n",
" [ 8135, 4338, 294, 6128, 1767, 399, 466, 1743, 2711, 292],\n",
" [ 7, 118, 16, 12, 14, 28444, 11, 6115, 15, 6],\n",
" [ 2563, 521, 19874, 21, 4154, 11438, 1676, 1258, 15, 1220],\n",
" [ 3926, 13259, 2554, 6771, 15, 3959, 12727, 303, 1158, 3061],\n",
" [ 10, 16, 3203, 28445, 807, 3287, 28480, 7428, 15, 17167],\n",
" [ 8, 10, 141, 937, 659, 6, 11162, 3061, 1901, 14]]])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"top10_pred_ids"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['[CLS]'] ['、', '。', '初めて', '友達', 'と', 'すご', 'た', 'たくさん', 'そんな', 'って']\n",
"['今年'] ['今年', '昨年', '冬', '自分', '秋', '夏', '最近', '私', '年', '今回']\n",
"['の'] ['の', '、', 'も', 'は', '一', 'と', 'から', '-', '初めて', '1']\n",
"['冬'] ['冬', '夏', '春', '秋', '始め', '時', 'オフ', '冬季', 'クリスマス', '中']\n",
"['は'] ['は', 'も', '、', 'に', 'の', 'で', 'いっぱい', '初めて', 'ずっと', 'から']\n",
"['友達'] ['友達', 'みんな', '友人', '仲間', '僕', '家族', 'いろいろ', '色々', '自分', '君']\n",
"['と'] ['と', 'で', 'に', 'ちゃんと', 'と共に', 'の', 'が', '一緒', 'を', 'だって']\n",
"['[MASK]'] ['遊び', '一緒', 'アメリカ', 'ハワイ', '韓国', '東京', '学校', '北海道', '沖縄', '海']\n",
"['に'] ['に', 'へ', 'て', 'で', 'が', '##に', 'を', 'って', 'し', '、']\n",
"['行き'] ['行き', '行っ', '行け', 'い', 'いき', '帰り', '行い', '来', 'し', '入り']\n",
"['まし'] ['まし', 'でし', 'ます', 'ませ', 'し', 'っ', 'だし', 'だっ', '始め', 'です']\n",
"['た'] ['た', 'て', 'ちゃ', '##た', 'たり', 'たら', '##こ', 'てる', 'し', 'たき']\n",
"['。'] ['。', 'た', 'という', 'ので', '!', '、', 'けど', 'です', 'ね', 'が']\n"
]
}
],
"outputs": [],
"source": [
"for correct_id, pred_ids in zip(token_ids[0], top10_pred_ids[0]):\n",
" correct_token = tokenizer.convert_ids_to_tokens([correct_id.item()])\n",
Expand Down

0 comments on commit 823e4e9

Please sign in to comment.