From 823e4e935395533f42ec1e456c7409c11de10735 Mon Sep 17 00:00:00 2001 From: Masatoshi Suzuki Date: Wed, 9 Oct 2019 15:50:20 +0900 Subject: [PATCH] Update tokenizers --- masked_lm_example.ipynb | 201 ++++++---------------------------------- 1 file changed, 27 insertions(+), 174 deletions(-) diff --git a/masked_lm_example.ipynb b/masked_lm_example.ipynb index cf0f2d4..ccce009 100644 --- a/masked_lm_example.ipynb +++ b/masked_lm_example.ipynb @@ -2,27 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "BERT_BASE_DIR = '/Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k'" + "BERT_BASE_DIR = '/Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/'" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I1001 22:27:35.863064 4618630592 file_utils.py:39] PyTorch version 1.2.0 available.\n", - "I1001 22:27:36.331261 4618630592 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "from transformers import BertForMaskedLM\n", @@ -31,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -40,103 +31,52 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "text = '今年の冬は友達と北海道に行きました。'" + "text = '朝食に[MASK]と牛乳を食べました。'" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "tokens = ['[CLS]'] + tokenizer.tokenize(text)" + "token_ids = tokenizer.encode(text, add_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['[CLS]', '今年', 'の', '冬', 'は', '友達', 'と', '北海道', 'に', '行き', 'まし', 'た', '。']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "tokens[7] = '[MASK]'" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['[CLS]', '今年', 'の', '冬', 'は', '友達', 'と', '[MASK]', 'に', '行き', 'まし', 'た', '。']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokens" + "token_ids" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "token_ids = tokenizer.convert_tokens_to_ids(tokens)" + "tokens = tokenizer.convert_ids_to_tokens(token_ids)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[2, 18337, 5, 2558, 9, 11680, 13, 4, 7, 2563, 3926, 10, 8]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "token_ids" + "tokens" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -145,69 +85,25 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 2, 18337, 5, 2558, 9, 11680, 13, 4, 7, 2563,\n", - " 3926, 10, 8]])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "token_ids" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "I1001 22:27:42.803631 4618630592 configuration_utils.py:148] loading configuration file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/config.json\n", - "I1001 22:27:42.805089 4618630592 configuration_utils.py:168] Model config {\n", - " \"attention_probs_dropout_prob\": 0.1,\n", - " \"finetuning_task\": null,\n", - " \"hidden_act\": \"gelu\",\n", - " \"hidden_dropout_prob\": 0.1,\n", - " \"hidden_size\": 768,\n", - " \"initializer_range\": 0.02,\n", - " \"intermediate_size\": 3072,\n", - " \"layer_norm_eps\": 1e-12,\n", - " \"max_position_embeddings\": 512,\n", - " \"num_attention_heads\": 12,\n", - " \"num_hidden_layers\": 12,\n", - " \"num_labels\": 2,\n", - " \"output_attentions\": false,\n", - " \"output_hidden_states\": false,\n", - " \"pruned_heads\": {},\n", - " \"torchscript\": false,\n", - " \"type_vocab_size\": 2,\n", - " \"use_bfloat16\": false,\n", - " \"vocab_size\": 32000\n", - "}\n", - "\n", - "I1001 22:27:42.806373 4618630592 modeling_utils.py:334] loading weights file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/pytorch_model.bin\n", - "I1001 22:27:45.143396 4618630592 modeling_utils.py:408] Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n" - ] - } - ], + "outputs": [], "source": [ "model = BertForMaskedLM.from_pretrained(BERT_BASE_DIR)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -216,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -225,61 +121,18 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ 6, 8, 1191, 11680, 13, 14142, 10, 12944, 4733, 6115],\n", - " [18337, 18822, 2558, 1052, 1331, 1460, 4960, 1322, 19, 7246],\n", - " [ 5, 6, 28, 9, 52, 13, 40, 60, 1191, 18],\n", - " [ 2558, 1460, 1383, 1331, 1158, 72, 4587, 7885, 8211, 51],\n", - " [ 9, 28, 6, 7, 5, 12, 14966, 1191, 10590, 40],\n", - " [11680, 8080, 3681, 3713, 6296, 2286, 14066, 15884, 1052, 2569],\n", - " [ 13, 12, 7, 25350, 996, 5, 14, 4338, 11, 21693],\n", - " [ 8135, 4338, 294, 6128, 1767, 399, 466, 1743, 2711, 292],\n", - " [ 7, 118, 16, 12, 14, 28444, 11, 6115, 15, 6],\n", - " [ 2563, 521, 19874, 21, 4154, 11438, 1676, 1258, 15, 1220],\n", - " [ 3926, 13259, 2554, 6771, 15, 3959, 12727, 303, 1158, 3061],\n", - " [ 10, 16, 3203, 28445, 807, 3287, 28480, 7428, 15, 17167],\n", - " [ 8, 10, 141, 937, 659, 6, 11162, 3061, 1901, 14]]])" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "top10_pred_ids" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['[CLS]'] ['、', '。', '初めて', '友達', 'と', 'すご', 'た', 'たくさん', 'そんな', 'って']\n", - "['今年'] ['今年', '昨年', '冬', '自分', '秋', '夏', '最近', '私', '年', '今回']\n", - "['の'] ['の', '、', 'も', 'は', '一', 'と', 'から', '-', '初めて', '1']\n", - "['冬'] ['冬', '夏', '春', '秋', '始め', '時', 'オフ', '冬季', 'クリスマス', '中']\n", - "['は'] ['は', 'も', '、', 'に', 'の', 'で', 'いっぱい', '初めて', 'ずっと', 'から']\n", - "['友達'] ['友達', 'みんな', '友人', '仲間', '僕', '家族', 'いろいろ', '色々', '自分', '君']\n", - "['と'] ['と', 'で', 'に', 'ちゃんと', 'と共に', 'の', 'が', '一緒', 'を', 'だって']\n", - "['[MASK]'] ['遊び', '一緒', 'アメリカ', 'ハワイ', '韓国', '東京', '学校', '北海道', '沖縄', '海']\n", - "['に'] ['に', 'へ', 'て', 'で', 'が', '##に', 'を', 'って', 'し', '、']\n", - "['行き'] ['行き', '行っ', '行け', 'い', 'いき', '帰り', '行い', '来', 'し', '入り']\n", - "['まし'] ['まし', 'でし', 'ます', 'ませ', 'し', 'っ', 'だし', 'だっ', '始め', 'です']\n", - "['た'] ['た', 'て', 'ちゃ', '##た', 'たり', 'たら', '##こ', 'てる', 'し', 'たき']\n", - "['。'] ['。', 'た', 'という', 'ので', '!', '、', 'けど', 'です', 'ね', 'が']\n" - ] - } - ], + "outputs": [], "source": [ "for correct_id, pred_ids in zip(token_ids[0], top10_pred_ids[0]):\n", " correct_token = tokenizer.convert_ids_to_tokens([correct_id.item()])\n",