Commit

Update an example.
singletongue committed Nov 6, 2019
1 parent e8e3834 commit d007e78
Showing 1 changed file with 29 additions and 24 deletions.
53 changes: 29 additions & 24 deletions masked_lm_example.ipynb
@@ -6,7 +6,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"BERT_BASE_DIR = '/Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/do-whole-word-mask'"
+"# specify the directory where the model files are stored\n",
+"BERT_BASE_DIR = '/path/to/mecab-ipadic-bpe-32k/do-whole-word-mask'"
 ]
 },
 {
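The hard-coded personal path becomes a portable placeholder. For anyone reproducing the notebook, BERT_BASE_DIR should point at the directory holding the pretrained files the later cells load (config.json, pytorch_model.bin, vocab.txt); a minimal sketch, with a hypothetical local path:

    # Hypothetical path: wherever the pretrained archive was extracted.
    # The directory must contain config.json, pytorch_model.bin, and vocab.txt.
    BERT_BASE_DIR = '/home/user/models/mecab-ipadic-bpe-32k/do-whole-word-mask'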
@@ -18,16 +19,16 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"I1030 22:45:41.339545 4452584896 file_utils.py:32] TensorFlow version 2.0.0 available.\n",
-"I1030 22:45:41.340346 4452584896 file_utils.py:39] PyTorch version 1.3.0.post2 available.\n",
-"I1030 22:45:41.590382 4452584896 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n"
+"I1106 11:10:32.632133 4675833280 file_utils.py:32] TensorFlow version 2.0.0 available.\n",
+"I1106 11:10:32.632836 4675833280 file_utils.py:39] PyTorch version 1.3.0.post2 available.\n",
+"I1106 11:10:32.935988 4675833280 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n"
 ]
 }
 ],
 "source": [
 "import torch\n",
 "from transformers import BertForMaskedLM\n",
-"from tokenization import MecabBertTokenizer"
+"from tokenization import MecabBertTokenizer, MecabCharacterBertTokenizer"
 ]
 },
 {
@@ -39,8 +40,8 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"I1030 22:45:41.636089 4452584896 configuration_utils.py:148] loading configuration file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/do-whole-word-mask/config.json\n",
-"I1030 22:45:41.637474 4452584896 configuration_utils.py:168] Model config {\n",
+"I1106 11:10:35.046508 4675833280 configuration_utils.py:148] loading configuration file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/do-whole-word-mask/config.json\n",
+"I1106 11:10:35.047818 4675833280 configuration_utils.py:168] Model config {\n",
 " \"attention_probs_dropout_prob\": 0.1,\n",
 " \"finetuning_task\": null,\n",
 " \"hidden_act\": \"gelu\",\n",
@@ -63,8 +64,8 @@
 " \"vocab_size\": 32000\n",
 "}\n",
 "\n",
-"I1030 22:45:41.638771 4452584896 modeling_utils.py:334] loading weights file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/do-whole-word-mask/pytorch_model.bin\n",
-"I1030 22:45:43.935035 4452584896 modeling_utils.py:408] Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n"
+"I1106 11:10:35.049305 4675833280 modeling_utils.py:334] loading weights file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/do-whole-word-mask/pytorch_model.bin\n",
+"I1106 11:10:37.441706 4675833280 modeling_utils.py:408] Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n"
 ]
 }
 ],
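The cell that emitted these logs is collapsed in this diff. Judging from the configuration_utils and modeling_utils lines above, it presumably amounts to something like the following sketch (not the verbatim cell):

    # Load config and weights from the local directory. The log about unused
    # 'cls.seq_relationship.*' weights is expected: the next-sentence-prediction
    # head is not part of BertForMaskedLM.
    model = BertForMaskedLM.from_pretrained(BERT_BASE_DIR)
    model.eval()  # disable dropout for deterministic predictions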
@@ -78,7 +79,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"tokenizer = MecabBertTokenizer(vocab_file=f'{BERT_BASE_DIR}/vocab.txt')"
+"tokenizer = MecabBertTokenizer(vocab_file=f'{BERT_BASE_DIR}/vocab.txt')\n",
+"# Use MecabCharacterBertTokenizer instead for char-4k models"
 ]
 },
 {
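The added comment documents the tokenizer choice: MecabBertTokenizer matches the 32k word-piece vocabularies, while the newly imported MecabCharacterBertTokenizer matches the char-4k character-level models. A sketch of the alternative, assuming the two classes share the same constructor signature:

    # Word-piece tokenizer for the 32k-vocab model loaded above.
    tokenizer = MecabBertTokenizer(vocab_file=f'{BERT_BASE_DIR}/vocab.txt')
    # For a char-4k model, swap in the character tokenizer instead:
    # tokenizer = MecabCharacterBertTokenizer(vocab_file=f'{BERT_BASE_DIR}/vocab.txt')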
@@ -87,7 +89,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"text = '朝食に[MASK]を食べました。'"
+"text = '朝食に[MASK]を焼いて食べました。'"
 ]
 },
 {
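For readers without Japanese: the old example 朝食に[MASK]を食べました。 reads "I ate [MASK] for breakfast.", while the new 朝食に[MASK]を焼いて食べました。 reads "For breakfast, I grilled [MASK] and ate it." The added 焼いて ("grilled") gives the model a stronger cue, visible in the updated predictions at the end of the diff.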
@@ -107,7 +109,7 @@
 {
 "data": {
 "text/plain": [
-"[2, 25965, 7, 4, 11, 2949, 3913, 10, 8, 3]"
+"[2, 25965, 7, 4, 11, 16878, 16, 2949, 3913, 10, 8, 3]"
 ]
 },
 "execution_count": 7,
@@ -136,7 +138,7 @@
 {
 "data": {
 "text/plain": [
-"['[CLS]', '朝食', 'に', '[MASK]', 'を', '食べ', 'まし', 'た', '。', '[SEP]']"
+"['[CLS]', '朝食', 'に', '[MASK]', 'を', '焼い', 'て', '食べ', 'まし', 'た', '。', '[SEP]']"
 ]
 },
 "execution_count": 9,
@@ -165,7 +167,8 @@
 {
 "data": {
 "text/plain": [
-"tensor([[ 2, 25965, 7, 4, 11, 2949, 3913, 10, 8, 3]])"
+"tensor([[ 2, 25965, 7, 4, 11, 16878, 16, 2949, 3913, 10,\n",
+" 8, 3]])"
 ]
 },
 "execution_count": 11,
@@ -186,16 +189,18 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"['[CLS]'] ['、', '」', 'まし', 'て', 'た', 'です', '朝食', 'お', '朝', '今']\n",
-"['朝食'] ['朝食', '朝', '夕食', '早朝', '午後', '昼', '最初', '午前', '食事', '代わり']\n",
-"['に'] ['に', 'は', 'として', 'の', '用', 'で', '2', '、', 'ニ', '後']\n",
-"['[MASK]'] ['パン', '朝食', 'ワイン', 'ご飯', 'コーヒー', 'カレー', '野菜', '魚', 'サラダ', '果物']\n",
-"['を'] ['を', '##食', '##イッチ', 'が', 'も', 'と', 'は', '##テト', 'の', 'カレー']\n",
-"['食べ'] ['食べ', '飲み', '与え', 'し', '食い', '食べる', '待ち', '受け', '始め', '焼き']\n",
-"['まし'] ['まし', 'でし', 'ます', 'ましょ', 'ませ', 'し', 'て', 'です', 'ちゃっ', '直し']\n",
-"['た'] ['た', 'て', '」', 'たら', 'ます', 'まし', 'し', 'よ', 'り', 'ん']\n",
-"['。'] ['。', '」', '!', '...。', 'が', '、', '......', ':', '』', '......。']\n",
-"['[SEP]'] ['。', '」', '、', '(', ':', 'し', 'は', 'て', '\"。', 'た']\n"
+"['[CLS]'] ['た', '」', '朝食', 'て', 'お', '、', 'まし', 'です', '朝', '。']\n",
+"['朝食'] ['朝食', '朝', '夕食', '早朝', '最初', '午後', '昼', '代わり', '食事', '夕方']\n",
+"['に'] ['に', 'は', 'として', 'の', '用', 'を', '中', '後', 'で', 'にかけて']\n",
+"['[MASK]'] ['パン', '肉', 'ご飯', '豚肉', 'ハム', '野菜', '牛肉', '[UNK]', 'ケーキ', 'バター']\n",
+"['を'] ['を', '##パン', 'は', '##焼き', 'に', '##肉', 'パン', '##を', '##ト', 'で']\n",
+"['焼い'] ['焼い', '焼く', '焼き', '作っ', 'し', '燃やし', '巻い', '使っ', '揚げ', '買っ']\n",
+"['て'] ['て', 'で', '##て', 'ながら', 'たら', 'を', 'から', 'って', 'た', 'に']\n",
+"['食べ'] ['食べ', 'い', '飲み', 'おり', '待ち', '食べる', '食', '食事', '見', 'もらい']\n",
+"['まし'] ['まし', 'でし', 'ましょ', 'ませ', 'ます', 'です', 'し', 'られ', 'て', 'でしょ']\n",
+"['た'] ['た', 'て', '」', 'ます', 'まし', 'し', 'たい', 'たら', 'ん', 'たり']\n",
+"['。'] ['。', '」', '!', 'が', '...。', '、', 'から', ':', '......', 'よ']\n",
+"['[SEP]'] ['。', 'て', '、', 'に', '(', 'た', 'は', '」', 'し', ')。']\n"
 ]
 }
 ],
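The cell that prints this block is also collapsed. Its output format, each input token followed by ten candidates, suggests a loop like the sketch below; the variable names are illustrative, and outputs[0] is the prediction-scores tensor in transformers 2.x. Consistent with the added 焼いて ("grilled"), the [MASK] candidates shift from generic breakfast items (パン "bread", ワイン "wine", コーヒー "coffee") toward foods one grills (肉 "meat", 豚肉 "pork", ハム "ham").

    # Batch of one, as shown in output 11.
    token_tensor = torch.tensor([token_ids])

    with torch.no_grad():
        outputs = model(token_tensor)
    scores = outputs[0][0]  # (seq_len, vocab_size)

    # Print each input token alongside its top-10 predicted replacements.
    topk_ids = scores.topk(10, dim=-1)[1]
    for position, token_id in enumerate(token_ids):
        input_token = tokenizer.convert_ids_to_tokens([token_id])
        candidates = tokenizer.convert_ids_to_tokens(topk_ids[position].tolist())
        print(input_token, candidates)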
