From bc089748d2a6ac4d0f6ebb9957f822191c602273 Mon Sep 17 00:00:00 2001 From: huseinzol05 Date: Wed, 16 Mar 2022 02:37:47 +0800 Subject: [PATCH] use better url regex --- docs/load-tokenizer.ipynb | 348 ++++-- example/tokenizer/load-tokenizer.ipynb | 348 ++++-- load-tokenizer.ipynb | 1527 ++++++++++++++++++++++++ malaya/text/regex.py | 4 + malaya/tokenizer.py | 10 +- 5 files changed, 1972 insertions(+), 265 deletions(-) create mode 100644 load-tokenizer.ipynb diff --git a/docs/load-tokenizer.ipynb b/docs/load-tokenizer.ipynb index 3fe3179f..70d6a7e5 100644 --- a/docs/load-tokenizer.ipynb +++ b/docs/load-tokenizer.ipynb @@ -27,16 +27,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.91 s, sys: 1.12 s, total: 7.03 s\n", - "Wall time: 7.62 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/huseinzolkepli/Documents/Malaya/malaya/preprocessing.py:259: FutureWarning: Possible nested set at position 2289\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" + "CPU times: user 6.52 s, sys: 1.42 s, total: 7.94 s\n", + "Wall time: 9.94 s\n" ] } ], @@ -57,7 +49,10 @@ "string4 = 'pada 10/4, kementerian mengumumkan, 1/100'\n", "string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni'\n", "string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'\n", - "string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'" + "string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'\n", + "string8 = 'online & desktop: regexr.com or download the desktop version for Mac'\n", + "string9 = 'belajaq unity di google.us.edi?34535/534534?dfg=g&fg unity'\n", + "string10 = 'Gambar ni membantu. Gambar tutorial >>. facebook. com/story. 
story_fbid=10206183032200965&id=1418962070'" ] }, { @@ -68,10 +63,11 @@ "\n", "```python\n", "class Tokenizer:\n", - " def __init__(self, lowercase = False, **kwargs):\n", + " def __init__(self, lowercase: bool = False, **kwargs):\n", " \"\"\"\n", - " Load Tokenizer object. \n", - " Check supported regex pattern at https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85\n", + " Load Tokenizer object.\n", + " Check supported regex pattern at \n", + " https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85\n", "\n", " Parameters\n", " ----------\n", @@ -81,6 +77,8 @@ " True to keep emojis.\n", " urls: bool, optional (default=True)\n", " True to keep urls.\n", + " urls_improved: bool, optional (default=True)\n", + " True to keep urls, better version.\n", " tags: bool, optional (default=True)\n", " True to keep tags: .\n", " emails: bool, optional (default=True)\n", @@ -131,7 +129,7 @@ "metadata": {}, "outputs": [], "source": [ - "tokenizer = malaya.preprocessing.Tokenizer()" + "tokenizer = malaya.tokenizer.Tokenizer()" ] }, { @@ -303,7 +301,9 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -337,6 +337,57 @@ "tokenizer.tokenize(string7)" ] }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['online',\n", + " '&',\n", + " 'desktop',\n", + " ':',\n", + " 'regexr.com',\n", + " 'or',\n", + " 'download',\n", + " 'the',\n", + " 'desktop',\n", + " 'version',\n", + " 'for',\n", + " 'Mac']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string8)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['belajaq', 'unity', 'di', 'google.us.edi?34535/534534?dfg=g&fg', 'unity']" + ] + }, + "execution_count": 11, + "metadata": {}, 
+ "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string9)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -346,8 +397,10 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, + "execution_count": 12, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -355,7 +408,7 @@ "['website', 'saya', 'http://huseinhouse.com']" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -364,6 +417,46 @@ "tokenizer.tokenize('website saya http://huseinhouse.com')" ] }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['website', 'saya', 'huseinhouse.com']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('website saya huseinhouse.com')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['website', 'saya', 'huseinhouse.com/pelik?a=1']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('website saya huseinhouse.com/pelik?a=1')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -373,7 +466,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -382,7 +475,7 @@ "['panggil', 'saya', '']" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -393,7 +486,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -402,7 +495,7 @@ "['panggil', 'saya', '<', 'husein', '>']" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -420,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, 
"outputs": [ { @@ -429,7 +522,7 @@ "['email', 'saya', 'husein@rumah.com']" ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -440,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -449,7 +542,7 @@ "['email', 'saya', 'husein@rumah.com.my']" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -467,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -476,7 +569,7 @@ "['twitter', 'saya', '@husein123zolkepli']" ] }, - "execution_count": 16, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -487,7 +580,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -496,7 +589,7 @@ "['twitter', 'saya', '@', 'husein123zolkepli']" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -514,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -523,7 +616,7 @@ "['panggil', 'saya', '#huseincomel']" ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -534,7 +627,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -543,7 +636,7 @@ "['panggil', 'saya', '#', 'huseincomel']" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -561,7 +654,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -570,7 +663,7 @@ "['call', 'sye', 'di', '013-1234567']" ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -581,7 +674,7 @@ }, { "cell_type": "code", - 
"execution_count": 27, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -590,7 +683,7 @@ "['call', 'sye', 'di', '013', '-', '1234567']" ] }, - "execution_count": 27, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -608,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -617,7 +710,7 @@ "['saya', 'sokong', '100%']" ] }, - "execution_count": 28, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -628,7 +721,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -637,7 +730,7 @@ "['saya', 'sokong', '100', '%']" ] }, - "execution_count": 29, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -655,7 +748,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -664,7 +757,7 @@ "['saya', 'tinggal', 'rm100']" ] }, - "execution_count": 30, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -675,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -684,7 +777,7 @@ "['saya', 'tinggal', 'rm100k']" ] }, - "execution_count": 31, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -695,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -704,7 +797,7 @@ "['saya', 'tinggal', 'rm100M']" ] }, - "execution_count": 32, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -715,7 +808,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -724,7 +817,7 @@ "['saya', 'tinggal', 'rm100.123M']" ] }, - "execution_count": 33, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -735,7 +828,7 @@ }, { "cell_type": 
"code", - "execution_count": 34, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -744,7 +837,7 @@ "['saya', 'tinggal', '40 sen']" ] }, - "execution_count": 34, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -755,7 +848,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -764,7 +857,7 @@ "['saya', 'tinggal', '21 ringgit', '50 sen']" ] }, - "execution_count": 35, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -782,7 +875,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -791,7 +884,7 @@ "['tarikh', 'perjumpaan', '10/11/2011']" ] }, - "execution_count": 36, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -802,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -811,7 +904,7 @@ "['tarikh', 'perjumpaan', '10-11-2011']" ] }, - "execution_count": 37, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -822,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -831,7 +924,7 @@ "['tarikh', 'perjumpaan', '12 mei 2011']" ] }, - "execution_count": 38, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -842,7 +935,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -851,7 +944,7 @@ "['tarikh', 'perjumpaan', 'mei 12 2011']" ] }, - "execution_count": 39, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -869,7 +962,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -878,7 +971,7 @@ "['jumpa', '3 am']" ] }, - "execution_count": 40, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -889,7 
+982,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -898,7 +991,7 @@ "['jumpa', '22:00']" ] }, - "execution_count": 41, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -916,7 +1009,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -925,7 +1018,7 @@ "['f**k', 'lah']" ] }, - "execution_count": 42, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -943,7 +1036,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -952,7 +1045,7 @@ "['*damn*', 'good', 'weih']" ] }, - "execution_count": 43, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -970,7 +1063,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -979,7 +1072,7 @@ "['no', 'saya', '123']" ] }, - "execution_count": 44, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -997,7 +1090,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -1006,7 +1099,7 @@ "['sejuk', 'harini', ',', '31.1c']" ] }, - "execution_count": 45, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -1017,7 +1110,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -1026,7 +1119,7 @@ "['sejuk', 'harini', ',', '31.1C']" ] }, - "execution_count": 46, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -1044,7 +1137,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -1053,7 +1146,7 @@ "['nak', 'sampai', 'lagi', '31km']" ] }, - "execution_count": 47, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -1064,7 +1157,7 @@ }, { 
"cell_type": "code", - "execution_count": 48, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -1073,7 +1166,7 @@ "['nak', 'sampai', 'lagi', '31 km']" ] }, - "execution_count": 48, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -1091,7 +1184,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -1100,7 +1193,7 @@ "['botol', 'ni', '400ml']" ] }, - "execution_count": 49, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -1111,7 +1204,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -1120,7 +1213,7 @@ "['botol', 'ni', '400 l']" ] }, - "execution_count": 50, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -1138,7 +1231,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -1147,7 +1240,7 @@ "['aku', 'dah', 'tunggu', 'kau', '2jam', 'kut']" ] }, - "execution_count": 51, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -1158,7 +1251,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -1167,7 +1260,7 @@ "['aku', 'dah', 'tunggu', 'kau', '2 jam', 'kut']" ] }, - "execution_count": 52, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -1178,7 +1271,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -1187,7 +1280,7 @@ "['lagi', '10 minit', '3 jam']" ] }, - "execution_count": 53, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -1205,7 +1298,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -1214,7 +1307,7 @@ "['berat', 'kau', '60 kg']" ] }, - "execution_count": 54, + "execution_count": 51, "metadata": {}, "output_type": 
"execute_result" } @@ -1225,7 +1318,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -1234,7 +1327,7 @@ "['berat', 'kau', '60kg']" ] }, - "execution_count": 55, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -1252,7 +1345,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -1261,7 +1354,7 @@ "['sememang-memangnya', 'kau', 'sakai']" ] }, - "execution_count": 56, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -1272,7 +1365,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 54, "metadata": {}, "outputs": [ { @@ -1281,7 +1374,7 @@ "['sememang', '-', 'memangnya', 'kau', 'sakai']" ] }, - "execution_count": 57, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -1299,26 +1392,15 @@ "We considered prefixes, suffixes, starters, acronyms, websites, emails, digits, before digits, time and month to split a sentence into multiple sentences.\n", "\n", "```python\n", - "def split_into_sentences(text, minimum_length = 5):\n", - " \"\"\"\n", - " Sentence tokenizer.\n", - "\n", - " Parameters\n", - " ----------\n", - " text: str\n", - " minimum_length: int, optional (default=5)\n", - " minimum length to assume a string is a string, default 5 characters.\n", - " \n", - " Returns\n", - " -------\n", - " result: List[str]\n", - " \"\"\"\n", + "class SentenceTokenizer:\n", + " def __init__(self):\n", + " pass\n", "```" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -1329,7 +1411,16 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "s_tokenizer = malaya.tokenizer.SentenceTokenizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, "metadata": {}, "outputs": [ { @@ -1339,18 +1430,18 
@@ " 'polis tembak pui pui pui bertubi tubi.']" ] }, - "execution_count": 59, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "malaya.text.function.split_into_sentences(s)" + "s_tokenizer.tokenize(s)" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -1361,7 +1452,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 59, "metadata": {}, "outputs": [ { @@ -1370,18 +1461,18 @@ "['email saya di husein.zol01@gmail.com, nanti jom berkopi.']" ] }, - "execution_count": 61, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "malaya.text.function.split_into_sentences(s)" + "s_tokenizer.tokenize(s)" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1392,7 +1483,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 61, "metadata": {}, "outputs": [ { @@ -1402,21 +1493,14 @@ " 'saya berjalan jalan ditepi muara jumpa anak dara.']" ] }, - "execution_count": 63, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "malaya.text.function.split_into_sentences(s)" + "s_tokenizer.tokenize(s)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/example/tokenizer/load-tokenizer.ipynb b/example/tokenizer/load-tokenizer.ipynb index 3fe3179f..70d6a7e5 100644 --- a/example/tokenizer/load-tokenizer.ipynb +++ b/example/tokenizer/load-tokenizer.ipynb @@ -27,16 +27,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.91 s, sys: 1.12 s, total: 7.03 s\n", - "Wall time: 7.62 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/huseinzolkepli/Documents/Malaya/malaya/preprocessing.py:259: FutureWarning: Possible nested set at position 
2289\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" + "CPU times: user 6.52 s, sys: 1.42 s, total: 7.94 s\n", + "Wall time: 9.94 s\n" ] } ], @@ -57,7 +49,10 @@ "string4 = 'pada 10/4, kementerian mengumumkan, 1/100'\n", "string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni'\n", "string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'\n", - "string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'" + "string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'\n", + "string8 = 'online & desktop: regexr.com or download the desktop version for Mac'\n", + "string9 = 'belajaq unity di google.us.edi?34535/534534?dfg=g&fg unity'\n", + "string10 = 'Gambar ni membantu. Gambar tutorial >>. facebook. com/story. story_fbid=10206183032200965&id=1418962070'" ] }, { @@ -68,10 +63,11 @@ "\n", "```python\n", "class Tokenizer:\n", - " def __init__(self, lowercase = False, **kwargs):\n", + " def __init__(self, lowercase: bool = False, **kwargs):\n", " \"\"\"\n", - " Load Tokenizer object. 
\n", - " Check supported regex pattern at https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85\n", + " Load Tokenizer object.\n", + " Check supported regex pattern at \n", + " https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85\n", "\n", " Parameters\n", " ----------\n", @@ -81,6 +77,8 @@ " True to keep emojis.\n", " urls: bool, optional (default=True)\n", " True to keep urls.\n", + " urls_improved: bool, optional (default=True)\n", + " True to keep urls, better version.\n", " tags: bool, optional (default=True)\n", " True to keep tags: .\n", " emails: bool, optional (default=True)\n", @@ -131,7 +129,7 @@ "metadata": {}, "outputs": [], "source": [ - "tokenizer = malaya.preprocessing.Tokenizer()" + "tokenizer = malaya.tokenizer.Tokenizer()" ] }, { @@ -303,7 +301,9 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -337,6 +337,57 @@ "tokenizer.tokenize(string7)" ] }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['online',\n", + " '&',\n", + " 'desktop',\n", + " ':',\n", + " 'regexr.com',\n", + " 'or',\n", + " 'download',\n", + " 'the',\n", + " 'desktop',\n", + " 'version',\n", + " 'for',\n", + " 'Mac']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string8)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['belajaq', 'unity', 'di', 'google.us.edi?34535/534534?dfg=g&fg', 'unity']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string9)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -346,8 +397,10 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, + "execution_count": 12, + "metadata": { + 
"scrolled": true + }, "outputs": [ { "data": { @@ -355,7 +408,7 @@ "['website', 'saya', 'http://huseinhouse.com']" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -364,6 +417,46 @@ "tokenizer.tokenize('website saya http://huseinhouse.com')" ] }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['website', 'saya', 'huseinhouse.com']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('website saya huseinhouse.com')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['website', 'saya', 'huseinhouse.com/pelik?a=1']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('website saya huseinhouse.com/pelik?a=1')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -373,7 +466,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -382,7 +475,7 @@ "['panggil', 'saya', '']" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -393,7 +486,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -402,7 +495,7 @@ "['panggil', 'saya', '<', 'husein', '>']" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -420,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -429,7 +522,7 @@ "['email', 'saya', 'husein@rumah.com']" ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -440,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, 
"metadata": {}, "outputs": [ { @@ -449,7 +542,7 @@ "['email', 'saya', 'husein@rumah.com.my']" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -467,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -476,7 +569,7 @@ "['twitter', 'saya', '@husein123zolkepli']" ] }, - "execution_count": 16, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -487,7 +580,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -496,7 +589,7 @@ "['twitter', 'saya', '@', 'husein123zolkepli']" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -514,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -523,7 +616,7 @@ "['panggil', 'saya', '#huseincomel']" ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -534,7 +627,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -543,7 +636,7 @@ "['panggil', 'saya', '#', 'huseincomel']" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -561,7 +654,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -570,7 +663,7 @@ "['call', 'sye', 'di', '013-1234567']" ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -581,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -590,7 +683,7 @@ "['call', 'sye', 'di', '013', '-', '1234567']" ] }, - "execution_count": 27, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -608,7 +701,7 @@ }, { 
"cell_type": "code", - "execution_count": 28, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -617,7 +710,7 @@ "['saya', 'sokong', '100%']" ] }, - "execution_count": 28, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -628,7 +721,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -637,7 +730,7 @@ "['saya', 'sokong', '100', '%']" ] }, - "execution_count": 29, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -655,7 +748,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -664,7 +757,7 @@ "['saya', 'tinggal', 'rm100']" ] }, - "execution_count": 30, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -675,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -684,7 +777,7 @@ "['saya', 'tinggal', 'rm100k']" ] }, - "execution_count": 31, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -695,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -704,7 +797,7 @@ "['saya', 'tinggal', 'rm100M']" ] }, - "execution_count": 32, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -715,7 +808,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -724,7 +817,7 @@ "['saya', 'tinggal', 'rm100.123M']" ] }, - "execution_count": 33, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -735,7 +828,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -744,7 +837,7 @@ "['saya', 'tinggal', '40 sen']" ] }, - "execution_count": 34, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -755,7 +848,7 @@ }, { 
"cell_type": "code", - "execution_count": 35, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -764,7 +857,7 @@ "['saya', 'tinggal', '21 ringgit', '50 sen']" ] }, - "execution_count": 35, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -782,7 +875,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -791,7 +884,7 @@ "['tarikh', 'perjumpaan', '10/11/2011']" ] }, - "execution_count": 36, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -802,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -811,7 +904,7 @@ "['tarikh', 'perjumpaan', '10-11-2011']" ] }, - "execution_count": 37, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -822,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -831,7 +924,7 @@ "['tarikh', 'perjumpaan', '12 mei 2011']" ] }, - "execution_count": 38, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -842,7 +935,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -851,7 +944,7 @@ "['tarikh', 'perjumpaan', 'mei 12 2011']" ] }, - "execution_count": 39, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -869,7 +962,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -878,7 +971,7 @@ "['jumpa', '3 am']" ] }, - "execution_count": 40, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -889,7 +982,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -898,7 +991,7 @@ "['jumpa', '22:00']" ] }, - "execution_count": 41, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ 
-916,7 +1009,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -925,7 +1018,7 @@ "['f**k', 'lah']" ] }, - "execution_count": 42, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -943,7 +1036,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -952,7 +1045,7 @@ "['*damn*', 'good', 'weih']" ] }, - "execution_count": 43, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -970,7 +1063,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -979,7 +1072,7 @@ "['no', 'saya', '123']" ] }, - "execution_count": 44, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -997,7 +1090,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -1006,7 +1099,7 @@ "['sejuk', 'harini', ',', '31.1c']" ] }, - "execution_count": 45, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -1017,7 +1110,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -1026,7 +1119,7 @@ "['sejuk', 'harini', ',', '31.1C']" ] }, - "execution_count": 46, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -1044,7 +1137,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -1053,7 +1146,7 @@ "['nak', 'sampai', 'lagi', '31km']" ] }, - "execution_count": 47, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -1064,7 +1157,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -1073,7 +1166,7 @@ "['nak', 'sampai', 'lagi', '31 km']" ] }, - "execution_count": 48, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } 
@@ -1091,7 +1184,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -1100,7 +1193,7 @@ "['botol', 'ni', '400ml']" ] }, - "execution_count": 49, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -1111,7 +1204,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -1120,7 +1213,7 @@ "['botol', 'ni', '400 l']" ] }, - "execution_count": 50, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -1138,7 +1231,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -1147,7 +1240,7 @@ "['aku', 'dah', 'tunggu', 'kau', '2jam', 'kut']" ] }, - "execution_count": 51, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -1158,7 +1251,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -1167,7 +1260,7 @@ "['aku', 'dah', 'tunggu', 'kau', '2 jam', 'kut']" ] }, - "execution_count": 52, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -1178,7 +1271,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -1187,7 +1280,7 @@ "['lagi', '10 minit', '3 jam']" ] }, - "execution_count": 53, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -1205,7 +1298,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -1214,7 +1307,7 @@ "['berat', 'kau', '60 kg']" ] }, - "execution_count": 54, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -1225,7 +1318,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -1234,7 +1327,7 @@ "['berat', 'kau', '60kg']" ] }, - "execution_count": 55, + "execution_count": 52, "metadata": {}, 
"output_type": "execute_result" } @@ -1252,7 +1345,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -1261,7 +1354,7 @@ "['sememang-memangnya', 'kau', 'sakai']" ] }, - "execution_count": 56, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -1272,7 +1365,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 54, "metadata": {}, "outputs": [ { @@ -1281,7 +1374,7 @@ "['sememang', '-', 'memangnya', 'kau', 'sakai']" ] }, - "execution_count": 57, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -1299,26 +1392,15 @@ "We considered prefixes, suffixes, starters, acronyms, websites, emails, digits, before digits, time and month to split a sentence into multiple sentences.\n", "\n", "```python\n", - "def split_into_sentences(text, minimum_length = 5):\n", - " \"\"\"\n", - " Sentence tokenizer.\n", - "\n", - " Parameters\n", - " ----------\n", - " text: str\n", - " minimum_length: int, optional (default=5)\n", - " minimum length to assume a string is a string, default 5 characters.\n", - " \n", - " Returns\n", - " -------\n", - " result: List[str]\n", - " \"\"\"\n", + "class SentenceTokenizer:\n", + " def __init__(self):\n", + " pass\n", "```" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -1329,7 +1411,16 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "s_tokenizer = malaya.tokenizer.SentenceTokenizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, "metadata": {}, "outputs": [ { @@ -1339,18 +1430,18 @@ " 'polis tembak pui pui pui bertubi tubi.']" ] }, - "execution_count": 59, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "malaya.text.function.split_into_sentences(s)" + "s_tokenizer.tokenize(s)" ] }, { "cell_type": 
"code", - "execution_count": 60, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -1361,7 +1452,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 59, "metadata": {}, "outputs": [ { @@ -1370,18 +1461,18 @@ "['email saya di husein.zol01@gmail.com, nanti jom berkopi.']" ] }, - "execution_count": 61, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "malaya.text.function.split_into_sentences(s)" + "s_tokenizer.tokenize(s)" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1392,7 +1483,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 61, "metadata": {}, "outputs": [ { @@ -1402,21 +1493,14 @@ " 'saya berjalan jalan ditepi muara jumpa anak dara.']" ] }, - "execution_count": 63, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "malaya.text.function.split_into_sentences(s)" + "s_tokenizer.tokenize(s)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/load-tokenizer.ipynb b/load-tokenizer.ipynb new file mode 100644 index 00000000..70d6a7e5 --- /dev/null +++ b/load-tokenizer.ipynb @@ -0,0 +1,1527 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Word and sentence tokenizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
<div class=\"alert alert-info\">\n", + "\n", + "This tutorial is available as an IPython notebook at [Malaya/example/tokenizer](https://github.com/huseinzol05/Malaya/tree/master/example/tokenizer).\n", + " \n", + "</div>
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6.52 s, sys: 1.42 s, total: 7.94 s\n", + "Wall time: 9.94 s\n" + ] + } + ], + "source": [ + "%%time\n", + "import malaya" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle, pada'\n", + "string2 = 'i mmg2 xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'\n", + "string3 = 'perdana menteri ke11 sgt suka makn ayam, harganya cuma rm15.50'\n", + "string4 = 'pada 10/4, kementerian mengumumkan, 1/100'\n", + "string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni'\n", + "string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'\n", + "string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'\n", + "string8 = 'online & desktop: regexr.com or download the desktop version for Mac'\n", + "string9 = 'belajaq unity di google.us.edi?34535/534534?dfg=g&fg unity'\n", + "string10 = 'Gambar ni membantu. Gambar tutorial >>. facebook. com/story. 
story_fbid=10206183032200965&id=1418962070'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load word tokenizer\n", + "\n", + "```python\n", + "class Tokenizer:\n", + " def __init__(self, lowercase: bool = False, **kwargs):\n", + " \"\"\"\n", + " Load Tokenizer object.\n", + " Check supported regex pattern at \n", + " https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85\n", + "\n", + " Parameters\n", + " ----------\n", + " lowercase: bool, optional (default=False)\n", + " lowercase tokens.\n", + " emojis: bool, optional (default=True)\n", + " True to keep emojis.\n", + " urls: bool, optional (default=True)\n", + " True to keep urls.\n", + " urls_improved: bool, optional (default=True)\n", + " True to keep urls, better version.\n", + " tags: bool, optional (default=True)\n", + " True to keep tags: .\n", + " emails: bool, optional (default=True)\n", + " True to keep emails.\n", + " users: bool, optional (default=True)\n", + " True to keep users handles: @cbaziotis.\n", + " hashtags: bool, optional (default=True)\n", + " True to keep hashtags.\n", + " phones: bool, optional (default=True)\n", + " True to keep phones.\n", + " percents: bool, optional (default=True)\n", + " True to keep percents.\n", + " money: bool, optional (default=True)\n", + " True to keep money expressions.\n", + " date: bool, optional (default=True)\n", + " True to keep date expressions.\n", + " time: bool, optional (default=True)\n", + " True to keep time expressions.\n", + " acronyms: bool, optional (default=True)\n", + " True to keep acronyms.\n", + " emoticons: bool, optional (default=True)\n", + " True to keep emoticons.\n", + " censored: bool, optional (default=True)\n", + " True to keep censored words: f**k.\n", + " emphasis: bool, optional (default=True)\n", + " True to keep words with emphasis: *very* good.\n", + " numbers: bool, optional (default=True)\n", + " True to keep numbers.\n", + " temperature: bool, optional 
(default=True)\n", + " True to keep temperatures\n", + " distance: bool, optional (default=True)\n", + " True to keep distances.\n", + " volume: bool, optional (default=True)\n", + " True to keep volumes.\n", + " duration: bool, optional (default=True)\n", + " True to keep durations.\n", + " weight: bool, optional (default=True)\n", + " True to keep weights.\n", + " hypen: bool, optional (default=True)\n", + " True to keep hypens.\n", + " \"\"\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = malaya.tokenizer.Tokenizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['xjdi',\n", + " 'ke',\n", + " ',',\n", + " 'y',\n", + " 'u',\n", + " 'xsuke',\n", + " 'makan',\n", + " 'HUSEIN',\n", + " 'kt',\n", + " 'situ',\n", + " 'tmpt',\n", + " ',',\n", + " 'i',\n", + " 'hate',\n", + " 'it',\n", + " '.',\n", + " 'pelikle',\n", + " ',',\n", + " 'pada']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['i',\n", + " 'mmg2',\n", + " 'xske',\n", + " 'mknn',\n", + " 'HUSEIN',\n", + " 'kampng',\n", + " 'tmpat',\n", + " ',',\n", + " 'i',\n", + " 'love',\n", + " 'them',\n", + " '.',\n", + " 'pelikle',\n", + " 'saye']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string2)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['perdana',\n", + " 'menteri',\n", + " 'ke11',\n", + " 'sgt',\n", + " 'suka',\n", + " 'makn',\n", + " 'ayam',\n", + " ',',\n", + " 'harganya',\n", + " 'cuma',\n", + " 'rm15.50']" + ] + }, + "execution_count": 6, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string3)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['pada',\n", + " '10',\n", + " '/',\n", + " '4',\n", + " ',',\n", + " 'kementerian',\n", + " 'mengumumkan',\n", + " ',',\n", + " '1',\n", + " '/',\n", + " '100']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string4)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Husein',\n", + " 'Zolkepli',\n", + " '(',\n", + " '2011',\n", + " '-',\n", + " '2019',\n", + " ')',\n", + " 'adalah',\n", + " 'ketua',\n", + " 'kampng',\n", + " 'di',\n", + " 'kedah',\n", + " 'sekolah',\n", + " 'King',\n", + " 'Edward',\n", + " 'ke-IV']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string6)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['2jam',\n", + " '30 minit',\n", + " 'aku',\n", + " 'tunggu',\n", + " 'kau',\n", + " ',',\n", + " '60.1 kg',\n", + " 'kau',\n", + " 'ni',\n", + " ',',\n", + " 'suhu',\n", + " 'harini',\n", + " '31.2c',\n", + " ',',\n", + " 'aku',\n", + " 'dahaga',\n", + " 'minum',\n", + " '600ml']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string7)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['online',\n", + " '&',\n", + " 'desktop',\n", + " ':',\n", + " 'regexr.com',\n", + " 'or',\n", + " 'download',\n", + " 'the',\n", + " 'desktop',\n", + " 'version',\n", + " 'for',\n", + " 'Mac']" + ] + }, + "execution_count": 
10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string8)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['belajaq', 'unity', 'di', 'google.us.edi?34535/534534?dfg=g&fg', 'unity']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize(string9)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### url" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['website', 'saya', 'http://huseinhouse.com']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('website saya http://huseinhouse.com')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['website', 'saya', 'huseinhouse.com']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('website saya huseinhouse.com')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['website', 'saya', 'huseinhouse.com/pelik?a=1']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('website saya huseinhouse.com/pelik?a=1')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### tags" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['panggil', 'saya', '']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('panggil saya ')" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['panggil', 'saya', '<', 'husein', '>']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('panggil saya ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### emails" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['email', 'saya', 'husein@rumah.com']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('email saya husein@rumah.com')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['email', 'saya', 'husein@rumah.com.my']" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('email saya husein@rumah.com.my')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### users" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['twitter', 'saya', '@husein123zolkepli']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('twitter saya @husein123zolkepli')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['twitter', 'saya', '@', 'husein123zolkepli']" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('twitter saya @ husein123zolkepli')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### hashtags" + ] + }, + { + "cell_type": "code", + "execution_count": 
21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['panggil', 'saya', '#huseincomel']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('panggil saya #huseincomel')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['panggil', 'saya', '#', 'huseincomel']" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('panggil saya # huseincomel')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### phones" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['call', 'sye', 'di', '013-1234567']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('call sye di 013-1234567')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['call', 'sye', 'di', '013', '-', '1234567']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('call sye di 013- 1234567')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### percents" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya', 'sokong', '100%']" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('saya sokong 100%')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya', 'sokong', '100', '%']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "tokenizer.tokenize('saya sokong 100 %')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### money" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya', 'tinggal', 'rm100']" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('saya tinggal rm100')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya', 'tinggal', 'rm100k']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('saya tinggal rm100k')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya', 'tinggal', 'rm100M']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('saya tinggal rm100M')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya', 'tinggal', 'rm100.123M']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('saya tinggal rm100.123M')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya', 'tinggal', '40 sen']" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('saya tinggal 40 sen')" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['saya', 'tinggal', '21 ringgit', '50 sen']" + ] + }, + "execution_count": 32, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('saya tinggal 21 ringgit 50 sen')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### date" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['tarikh', 'perjumpaan', '10/11/2011']" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('tarikh perjumpaan 10/11/2011')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['tarikh', 'perjumpaan', '10-11-2011']" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('tarikh perjumpaan 10-11-2011')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['tarikh', 'perjumpaan', '12 mei 2011']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('tarikh perjumpaan 12 mei 2011')" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['tarikh', 'perjumpaan', 'mei 12 2011']" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('tarikh perjumpaan mei 12 2011')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### time" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['jumpa', '3 am']" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('jumpa 3 am')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "['jumpa', '22:00']" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('jumpa 22:00')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### censored" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['f**k', 'lah']" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('f**k lah')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### emphasis" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['*damn*', 'good', 'weih']" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('*damn* good weih')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['no', 'saya', '123']" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('no saya 123')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### temperature" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sejuk', 'harini', ',', '31.1c']" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('sejuk harini, 31.1c')" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sejuk', 'harini', ',', '31.1C']" + ] + }, + 
"execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('sejuk harini, 31.1C')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### distance" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['nak', 'sampai', 'lagi', '31km']" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('nak sampai lagi 31km')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['nak', 'sampai', 'lagi', '31 km']" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('nak sampai lagi 31 km')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### volume" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['botol', 'ni', '400ml']" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('botol ni 400ml')" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['botol', 'ni', '400 l']" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('botol ni 400 l')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### duration" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['aku', 'dah', 'tunggu', 'kau', '2jam', 'kut']" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('aku dah tunggu 
kau 2jam kut')" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['aku', 'dah', 'tunggu', 'kau', '2 jam', 'kut']" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('aku dah tunggu kau 2 jam kut')" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['lagi', '10 minit', '3 jam']" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('lagi 10 minit 3 jam')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### weight" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['berat', 'kau', '60 kg']" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('berat kau 60 kg')" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['berat', 'kau', '60kg']" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('berat kau 60kg')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### hypen" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sememang-memangnya', 'kau', 'sakai']" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('sememang-memangnya kau sakai')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sememang', '-', 'memangnya', 'kau', 'sakai']" + ] + }, + 
"execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.tokenize('sememang- memangnya kau sakai')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sentence tokenizer\n", + "\n", + "We considered prefixes, suffixes, starters, acronyms, websites, emails, digits, before digits, time and month to split a sentence into multiple sentences.\n", + "\n", + "```python\n", + "class SentenceTokenizer:\n", + " def __init__(self):\n", + " pass\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "s = \"\"\"\n", + "no.1 polis bertemu dengan suspek di ladang getah. polis tembak pui pui pui bertubi tubi\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "s_tokenizer = malaya.tokenizer.SentenceTokenizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['no.1 polis bertemu dengan suspek di ladang getah.',\n", + " 'polis tembak pui pui pui bertubi tubi.']" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s_tokenizer.tokenize(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "s = \"\"\"\n", + "email saya di husein.zol01@gmail.com, nanti jom berkopi\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['email saya di husein.zol01@gmail.com, nanti jom berkopi.']" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s_tokenizer.tokenize(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "s = \"\"\"\n", + "ke. 2 cerita nya begini. 
saya berjalan jalan ditepi muara jumpa anak dara.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ke.2 cerita nya begini.',\n", + " 'saya berjalan jalan ditepi muara jumpa anak dara.']" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s_tokenizer.tokenize(s)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/malaya/text/regex.py b/malaya/text/regex.py index b244b4df..49f69da7 100644 --- a/malaya/text/regex.py +++ b/malaya/text/regex.py @@ -109,6 +109,10 @@ 'number': _number, 'allcaps': r'(?. emails: bool, optional (default=True) @@ -67,6 +69,7 @@ def __init__(self, lowercase=False, **kwargs): emojis = kwargs.get('emojis', True) urls = kwargs.get('urls', True) + urls_improved = kwargs.get('urls_improved', True) tags = kwargs.get('tags', True) emails = kwargs.get('emails', True) users = kwargs.get('users', True) @@ -92,6 +95,10 @@ def __init__(self, lowercase=False, **kwargs): if urls: pipeline.append(self.regexes['url']) + if urls_improved: + pipeline.append(self.regexes['url_v2']) + pipeline.append(self.regexes['url_dperini']) + if tags: pipeline.append(self.regexes['tag']) @@ -196,6 +203,7 @@ def tokenize(self, string: str): """ escaped = html.unescape(string) tokenized = self.tok.findall(escaped) + tokenized = [t[0] if isinstance(t, tuple) else t for t in tokenized] if self.lowercase: tokenized = [t.lower() for t in tokenized]