diff --git a/docs/Word2num.rst b/docs/Word2num.rst new file mode 100644 index 00000000..92e652cf --- /dev/null +++ b/docs/Word2num.rst @@ -0,0 +1,9 @@ +Word2Num +================== + +.. note:: + + This tutorial is available as an IPython notebook + `here <https://github.com/huseinzol05/Malaya/tree/master/example/word2num>`_. + +.. include:: load-word2num.rst diff --git a/docs/index.rst b/docs/index.rst index 35e94eb6..b6fa8f16 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -43,6 +43,7 @@ Contents: Summarization Topic Toxic + Word2num Word2vec Mover Cluster diff --git a/docs/load-normalizer.rst b/docs/load-normalizer.rst index 7ea1933d..74f6d0fe 100644 --- a/docs/load-normalizer.rst +++ b/docs/load-normalizer.rst @@ -7,172 +7,176 @@ .. parsed-literal:: - CPU times: user 11.4 s, sys: 1.54 s, total: 12.9 s - Wall time: 16.7 s + CPU times: user 12.4 s, sys: 1.57 s, total: 14 s + Wall time: 17.9 s .. code:: python - string = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle' - another = 'i mmg xske mknn HUSEIN kampng tempt, i love them. pelikle' + string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle' + string2 = 'i mmg xske mknn HUSEIN kampng tmpat, i love them. pelikle saye' + string3 = 'perdana menteri ke11 sgt suka mkan ayam, harganya cuma rm15.50' + string4 = 'pada 10/4, kementerian mengumumkan' + string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni' + string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah' Load basic normalizer --------------------- .. code:: python - malaya.normalize.basic(string) - - + print(malaya.normalize.basic(string1)) + print(malaya.normalize.basic(string2)) + print(malaya.normalize.basic(string3)) + print(malaya.normalize.basic(string4)) + print(malaya.normalize.basic(string5)) + print(malaya.normalize.basic(string6)) .. 
parsed-literal:: - 'xjdi ke kenapa awak xsuke makan Husein kt situ tmpt i hate it pelikle' - + xjdi ke kenapa awak xsuke makan Husein kt situ tmpt saya hate it pelikle + saya mmg xske mknn Husein kampng tmpat saya love them pelikle saye + perdana menteri ke sgt suka mkan ayam harganya cuma rm + pada kementerian mengumumkan + Husein Zolkepli dapat tempat ke lumba lari hari ni + Husein Zolkepli adalah ketua kampng di kedah -Load fuzzy normalizer +Load spell normalizer --------------------- .. code:: python - malays = malaya.load_malay_dictionary() - normalizer = malaya.normalize.fuzzy(malays) + corrector = malaya.spell.probability() + normalizer = malaya.normalize.spell(corrector) .. code:: python - normalizer.normalize(string) - - + print(normalizer.normalize(string1)) + print(normalizer.normalize(string2)) + print(normalizer.normalize(string3)) + print(normalizer.normalize(string4)) + print(normalizer.normalize(string5)) + print(normalizer.normalize(string6)) .. parsed-literal:: - 'tak jadi ke kenapa awak tak suka makan Husein kat situ tempat saya hate it pelik lah' + tak jadi ke , kenapa awak tak suka makan HUSEIN kat itu mpt , saya hate it . pelik lah + saya memang tak suka makanan HUSEIN kampung tempat , saya love them . pelik lah sama + perdana menteri ke-sebelas sangat suka makan awam , harganya cuma lima belas perpuluhan lima ringgit + pada sepuluh hari bulan empat , kementerian mengumumkan + Husein Zolkepli dapat tempat ke-dua belas lumba lari hari ni + Husein Zolkepli ( dua ribu sebelas hingga dua ribu sembilan belas ) adalah ketua kampung di kedai +We can see that our normalizer normalized ``ayam`` to ``awam``. This +is because we force our spelling corrector to predict a correction; to +disable that, simply pass ``assume_wrong = False``. .. code:: python - normalizer.normalize(another) - - + %%time + normalizer.normalize(string3, assume_wrong = False) .. 
parsed-literal:: - 'saya memang tak saka makanan Husein kampung tempt saya love them pelik lah' + CPU times: user 505 µs, sys: 1e+03 ns, total: 506 µs + Wall time: 513 µs -Load spell normalizer ---------------------- -.. code:: python +.. parsed-literal:: + + 'perdana menteri ke-sebelas sangat suka makan ayam , harganya cuma lima belas perpuluhan lima ringgit' - normalizer = malaya.normalize.spell(malays) -To list all selected words during normalize -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: python - normalizer.normalize(string,debug=True) + %%time + normalizer.normalize(string2, assume_wrong = False) .. parsed-literal:: - [(('judi', False), 86), (('adi', False), 67), (('di', False), 80), (('jadi', False), 86)] - - [(('tepu', False), 50), (('amput', False), 67), (('tamat', False), 67), (('empat', True), 67), (('tumit', False), 67), (('ampe', False), 50), (('tipu', False), 50), (('tat', False), 57), (('top', False), 57), (('tampu', False), 67), (('topi', False), 50), (('tepi', False), 50), (('tempat', False), 80), (('umut', False), 50), (('ampo', False), 50), (('timpa', False), 67), (('impi', False), 50), (('tempe', False), 67), (('tapa', False), 50), (('taat', False), 50), (('tepet', False), 67), (('umat', False), 50), (('tepat', False), 67), (('tut', False), 57), (('tumpat', True), 80), (('tuat', False), 50), (('tampi', True), 67), (('umpat', True), 67), (('temut', False), 67), (('emat', False), 50), (('ampit', False), 67), (('amit', False), 50), (('tempo', False), 67), (('tumpu', False), 67), (('tempa', False), 67), (('empu', False), 50), (('amat', False), 50), (('taut', False), 50), (('mat', False), 57), (('tampa', False), 67), (('tuit', False), 50), (('tip', False), 57), (('ampu', False), 50), (('tapi', False), 50)] - + CPU times: user 1.54 ms, sys: 27 µs, total: 1.57 ms + Wall time: 1.59 ms .. 
parsed-literal:: - 'tak jadi ke kenapa awak tak suka makan Husein kat situ tempat saya hate it pelik lah' + 'saya memang tak ska makanan HUSEIN kampung tempat , saya love them . pelik lah saya' -List available deep learning stemming models --------------------------------------------- - .. code:: python - malaya.normalize.available_deep_model() - - + %%time + normalizer.normalize(string6, assume_wrong = False) .. parsed-literal:: - ['lstm', 'bahdanau', 'luong'] - + CPU times: user 450 µs, sys: 15 µs, total: 465 µs + Wall time: 482 µs -Load deep learning ------------------- - -We experimenting a lot for ``seq2seq`` models, we try to do the best -normalizer deep learning models. - -.. code:: python - - normalizer = malaya.normalize.deep_model(malays, 'bahdanau') - print(normalizer.normalize(string)) - normalizer.normalize(another) .. parsed-literal:: - jidiomik ke kenapa awak sukeesi makan Husein kat situ tempatmo saya hate it pelik lah - + 'Husein Zolkepli ( dua ribu sebelas hingga dua ribu sembilan belas ) adalah ketua kampung di kedah' -.. parsed-literal:: - - 'saya memang sikeuoi maknnkano Husein kampanga tempt saya love them pelik lah' +Load fuzzy normalizer +--------------------- +.. code:: python + malays = malaya.load_malay_dictionary() + normalizer = malaya.normalize.fuzzy(malays) .. code:: python - normalizer = malaya.normalize.deep_model(malays, 'luong') - print(normalizer.normalize(string)) - normalizer.normalize(another) + %%time + normalizer.normalize(string3) .. parsed-literal:: - jadidilox ke kenapa awak sokeled makan Husein kat situ tampatgllah saya hate it pelik lah + CPU times: user 7.54 s, sys: 83 ms, total: 7.63 s + Wall time: 7.9 s .. parsed-literal:: - 'saya memang skeflleh makafnnloh Husein kampangja tempt saya love them pelik lah' + 'perdana menteri ke-sebelas sangat suka makan ayam , harganya cuma lima belas perpuluhan lima ringgit' .. 
code:: python - normalizer = malaya.normalize.deep_model(malays, 'lstm') - print(normalizer.normalize(string)) - normalizer.normalize(another) + %%time + normalizer.normalize(string2) .. parsed-literal:: - jajiodi ke kenapa awak sukeeia makan Husein kat situ tempatwa saya hate it pelik lah + CPU times: user 7.43 s, sys: 65.9 ms, total: 7.49 s + Wall time: 7.7 s .. parsed-literal:: - 'saya memang sekeoia makankari Husein kampangi tempt saya love them pelik lah' + 'saya memang tak saka makanan HUSEIN kampung tempat , saya love them . pelik lah saya' diff --git a/docs/load-word2num.rst b/docs/load-word2num.rst new file mode 100644 index 00000000..57eb4259 --- /dev/null +++ b/docs/load-word2num.rst @@ -0,0 +1,93 @@ + +.. code:: python + + import malaya + +.. code:: python + + malaya.word2num.word2num('dua belas') + + + + +.. parsed-literal:: + + 12 + + + +.. code:: python + + malaya.word2num.word2num('kesebelas') + + + + +.. parsed-literal:: + + 11 + + + +.. code:: python + + malaya.word2num.word2num('kesebelas') + + + + +.. parsed-literal:: + + 11 + + + +.. code:: python + + malaya.word2num.word2num('negatif kesebelas') + + + + +.. parsed-literal:: + + -11 + + + +.. code:: python + + malaya.word2num.word2num('seratus dua puluh tiga juta empat ratus lima puluh enam ribu tujuh ratus lapan puluh sembilan') + + + + +.. parsed-literal:: + + 123456789 + + + +.. code:: python + + malaya.word2num.word2num('negatif seratus dua puluh tiga juta empat ratus lima puluh enam ribu tujuh ratus lapan puluh sembilan') + + + + +.. parsed-literal:: + + -123456789 + + + +.. code:: python + + malaya.word2num.word2num('negatif satu juta dua ratus tiga puluh empat ribu lima ratus enam puluh tujuh perpuluhan lapan sembilan') + + + + +.. parsed-literal:: + + -1234567.89 diff --git a/example/normalizer/README.rst b/example/normalizer/README.rst index 033b0829..cefba78b 100644 --- a/example/normalizer/README.rst +++ b/example/normalizer/README.rst @@ -7,174 +7,178 @@ .. 
parsed-literal:: - CPU times: user 11.4 s, sys: 1.54 s, total: 12.9 s - Wall time: 16.7 s + CPU times: user 12.4 s, sys: 1.57 s, total: 14 s + Wall time: 17.9 s .. code:: ipython3 - string = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle' - another = 'i mmg xske mknn HUSEIN kampng tempt, i love them. pelikle' + string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle' + string2 = 'i mmg xske mknn HUSEIN kampng tmpat, i love them. pelikle saye' + string3 = 'perdana menteri ke11 sgt suka mkan ayam, harganya cuma rm15.50' + string4 = 'pada 10/4, kementerian mengumumkan' + string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni' + string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah' Load basic normalizer --------------------- .. code:: ipython3 - malaya.normalize.basic(string) - - + print(malaya.normalize.basic(string1)) + print(malaya.normalize.basic(string2)) + print(malaya.normalize.basic(string3)) + print(malaya.normalize.basic(string4)) + print(malaya.normalize.basic(string5)) + print(malaya.normalize.basic(string6)) .. parsed-literal:: - 'xjdi ke kenapa awak xsuke makan Husein kt situ tmpt i hate it pelikle' - + xjdi ke kenapa awak xsuke makan Husein kt situ tmpt saya hate it pelikle + saya mmg xske mknn Husein kampng tmpat saya love them pelikle saye + perdana menteri ke sgt suka mkan ayam harganya cuma rm + pada kementerian mengumumkan + Husein Zolkepli dapat tempat ke lumba lari hari ni + Husein Zolkepli adalah ketua kampng di kedah -Load fuzzy normalizer +Load spell normalizer --------------------- .. code:: ipython3 - malays = malaya.load_malay_dictionary() - normalizer = malaya.normalize.fuzzy(malays) + corrector = malaya.spell.probability() + normalizer = malaya.normalize.spell(corrector) .. 
code:: ipython3 - normalizer.normalize(string) - - + print(normalizer.normalize(string1)) + print(normalizer.normalize(string2)) + print(normalizer.normalize(string3)) + print(normalizer.normalize(string4)) + print(normalizer.normalize(string5)) + print(normalizer.normalize(string6)) .. parsed-literal:: - 'tak jadi ke kenapa awak tak suka makan Husein kat situ tempat saya hate it pelik lah' + tak jadi ke , kenapa awak tak suka makan HUSEIN kat itu mpt , saya hate it . pelik lah + saya memang tak suka makanan HUSEIN kampung tempat , saya love them . pelik lah sama + perdana menteri ke-sebelas sangat suka makan awam , harganya cuma lima belas perpuluhan lima ringgit + pada sepuluh hari bulan empat , kementerian mengumumkan + Husein Zolkepli dapat tempat ke-dua belas lumba lari hari ni + Husein Zolkepli ( dua ribu sebelas hingga dua ribu sembilan belas ) adalah ketua kampung di kedai +We can see that our normalizer normalized ``ayam`` to ``awam``. This +is because we force our spelling corrector to predict a correction; to +disable that, simply pass ``assume_wrong = False``. .. code:: ipython3 - normalizer.normalize(another) - - + %%time + normalizer.normalize(string3, assume_wrong = False) .. parsed-literal:: - 'saya memang tak saka makanan Husein kampung tempt saya love them pelik lah' + CPU times: user 505 µs, sys: 1e+03 ns, total: 506 µs + Wall time: 513 µs -Load spell normalizer ---------------------- -.. code:: ipython3 +.. parsed-literal:: + + 'perdana menteri ke-sebelas sangat suka makan ayam , harganya cuma lima belas perpuluhan lima ringgit' - normalizer = malaya.normalize.spell(malays) -To list all selected words during normalize -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. code:: ipython3 - normalizer.normalize(string,debug=True) + %%time + normalizer.normalize(string2, assume_wrong = False) .. 
parsed-literal:: - [(('judi', False), 86), (('adi', False), 67), (('di', False), 80), (('jadi', False), 86)] - - [(('tepu', False), 50), (('amput', False), 67), (('tamat', False), 67), (('empat', True), 67), (('tumit', False), 67), (('ampe', False), 50), (('tipu', False), 50), (('tat', False), 57), (('top', False), 57), (('tampu', False), 67), (('topi', False), 50), (('tepi', False), 50), (('tempat', False), 80), (('umut', False), 50), (('ampo', False), 50), (('timpa', False), 67), (('impi', False), 50), (('tempe', False), 67), (('tapa', False), 50), (('taat', False), 50), (('tepet', False), 67), (('umat', False), 50), (('tepat', False), 67), (('tut', False), 57), (('tumpat', True), 80), (('tuat', False), 50), (('tampi', True), 67), (('umpat', True), 67), (('temut', False), 67), (('emat', False), 50), (('ampit', False), 67), (('amit', False), 50), (('tempo', False), 67), (('tumpu', False), 67), (('tempa', False), 67), (('empu', False), 50), (('amat', False), 50), (('taut', False), 50), (('mat', False), 57), (('tampa', False), 67), (('tuit', False), 50), (('tip', False), 57), (('ampu', False), 50), (('tapi', False), 50)] - + CPU times: user 1.54 ms, sys: 27 µs, total: 1.57 ms + Wall time: 1.59 ms .. parsed-literal:: - 'tak jadi ke kenapa awak tak suka makan Husein kat situ tempat saya hate it pelik lah' - + 'saya memang tak ska makanan HUSEIN kampung tempat , saya love them . pelik lah saya' -List available deep learning stemming models --------------------------------------------- .. code:: ipython3 - malaya.normalize.available_deep_model() - - + %%time + normalizer.normalize(string6, assume_wrong = False) .. parsed-literal:: - ['lstm', 'bahdanau', 'luong'] + CPU times: user 450 µs, sys: 15 µs, total: 465 µs + Wall time: 482 µs -Load deep learning ------------------- - -We experimenting a lot for ``seq2seq`` models, we try to do the best -normalizer deep learning models. - -.. 
code:: ipython3 - - normalizer = malaya.normalize.deep_model(malays, 'bahdanau') - print(normalizer.normalize(string)) - normalizer.normalize(another) - .. parsed-literal:: - jidiomik ke kenapa awak sukeesi makan Husein kat situ tempatmo saya hate it pelik lah - + 'Husein Zolkepli ( dua ribu sebelas hingga dua ribu sembilan belas ) adalah ketua kampung di kedah' -.. parsed-literal:: - - 'saya memang sikeuoi maknnkano Husein kampanga tempt saya love them pelik lah' +Load fuzzy normalizer +--------------------- +.. code:: ipython3 + malays = malaya.load_malay_dictionary() + normalizer = malaya.normalize.fuzzy(malays) .. code:: ipython3 - normalizer = malaya.normalize.deep_model(malays, 'luong') - print(normalizer.normalize(string)) - normalizer.normalize(another) + %%time + normalizer.normalize(string3) .. parsed-literal:: - jadidilox ke kenapa awak sokeled makan Husein kat situ tampatgllah saya hate it pelik lah + CPU times: user 7.54 s, sys: 83 ms, total: 7.63 s + Wall time: 7.9 s .. parsed-literal:: - 'saya memang skeflleh makafnnloh Husein kampangja tempt saya love them pelik lah' + 'perdana menteri ke-sebelas sangat suka makan ayam , harganya cuma lima belas perpuluhan lima ringgit' .. code:: ipython3 - normalizer = malaya.normalize.deep_model(malays, 'lstm') - print(normalizer.normalize(string)) - normalizer.normalize(another) + %%time + normalizer.normalize(string2) .. parsed-literal:: - jajiodi ke kenapa awak sukeeia makan Husein kat situ tempatwa saya hate it pelik lah + CPU times: user 7.43 s, sys: 65.9 ms, total: 7.49 s + Wall time: 7.7 s .. parsed-literal:: - 'saya memang sekeoia makankari Husein kampangi tempt saya love them pelik lah' + 'saya memang tak saka makanan HUSEIN kampung tempat , saya love them . 
pelik lah saya' diff --git a/example/normalizer/load-normalizer.ipynb b/example/normalizer/load-normalizer.ipynb index 6627e911..bba110ca 100644 --- a/example/normalizer/load-normalizer.ipynb +++ b/example/normalizer/load-normalizer.ipynb @@ -9,8 +9,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 11.4 s, sys: 1.54 s, total: 12.9 s\n", - "Wall time: 16.7 s\n" + "CPU times: user 12.4 s, sys: 1.57 s, total: 14 s\n", + "Wall time: 17.9 s\n" ] } ], @@ -25,8 +25,12 @@ "metadata": {}, "outputs": [], "source": [ - "string = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle'\n", - "another = 'i mmg xske mknn HUSEIN kampng tempt, i love them. pelikle'" + "string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle'\n", + "string2 = 'i mmg xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'\n", + "string3 = 'perdana menteri ke11 sgt suka mkan ayam, harganya cuma rm15.50'\n", + "string4 = 'pada 10/4, kementerian mengumumkan'\n", + "string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni'\n", + "string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah'" ] }, { @@ -42,25 +46,32 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "'xjdi ke kenapa awak xsuke makan Husein kt situ tmpt i hate it pelikle'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "xjdi ke kenapa awak xsuke makan Husein kt situ tmpt saya hate it pelikle\n", + "saya mmg xske mknn Husein kampng tmpat saya love them pelikle saye\n", + "perdana menteri ke sgt suka mkan ayam harganya cuma rm\n", + "pada kementerian mengumumkan\n", + "Husein Zolkepli dapat tempat ke lumba lari hari ni\n", + "Husein Zolkepli adalah ketua kampng di kedah\n" + ] } ], "source": [ - "malaya.normalize.basic(string)" + "print(malaya.normalize.basic(string1))\n", + "print(malaya.normalize.basic(string2))\n", + 
"print(malaya.normalize.basic(string3))\n", + "print(malaya.normalize.basic(string4))\n", + "print(malaya.normalize.basic(string5))\n", + "print(malaya.normalize.basic(string6))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Load fuzzy normalizer" + "## Load spell normalizer" ] }, { @@ -69,8 +80,8 @@ "metadata": {}, "outputs": [], "source": [ - "malays = malaya.load_malay_dictionary()\n", - "normalizer = malaya.normalize.fuzzy(malays)" + "corrector = malaya.spell.probability()\n", + "normalizer = malaya.normalize.spell(corrector)" ] }, { @@ -79,18 +90,32 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "'tak jadi ke kenapa awak tak suka makan Husein kat situ tempat saya hate it pelik lah'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "tak jadi ke , kenapa awak tak suka makan HUSEIN kat itu mpt , saya hate it . pelik lah\n", + "saya memang tak suka makanan HUSEIN kampung tempat , saya love them . pelik lah sama\n", + "perdana menteri ke-sebelas sangat suka makan awam , harganya cuma lima belas perpuluhan lima ringgit\n", + "pada sepuluh hari bulan empat , kementerian mengumumkan\n", + "Husein Zolkepli dapat tempat ke-dua belas lumba lari hari ni\n", + "Husein Zolkepli ( dua ribu sebelas hingga dua ribu sembilan belas ) adalah ketua kampung di kedai\n" + ] } ], "source": [ - "normalizer.normalize(string)" + "print(normalizer.normalize(string1))\n", + "print(normalizer.normalize(string2))\n", + "print(normalizer.normalize(string3))\n", + "print(normalizer.normalize(string4))\n", + "print(normalizer.normalize(string5))\n", + "print(normalizer.normalize(string6))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that our normalizer normalized `ayam` to `awam`. This is because we force our spelling corrector to predict a correction; to disable that, simply pass `assume_wrong = False`."
] }, { @@ -98,10 +123,18 @@ "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 505 µs, sys: 1e+03 ns, total: 506 µs\n", + "Wall time: 513 µs\n" + ] + }, { "data": { "text/plain": [ - "'saya memang tak saka makanan Husein kampung tempt saya love them pelik lah'" + "'perdana menteri ke-sebelas sangat suka makan ayam , harganya cuma lima belas perpuluhan lima ringgit'" ] }, "execution_count": 6, @@ -110,101 +143,83 @@ } ], "source": [ - "normalizer.normalize(another)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load spell normalizer" + "%%time\n", + "normalizer.normalize(string3, assume_wrong = False)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [ - "normalizer = malaya.normalize.spell(malays)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### To list all selected words during normalize" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[(('judi', False), 86), (('adi', False), 67), (('di', False), 80), (('jadi', False), 86)] \n", - "\n", - "[(('tepu', False), 50), (('amput', False), 67), (('tamat', False), 67), (('empat', True), 67), (('tumit', False), 67), (('ampe', False), 50), (('tipu', False), 50), (('tat', False), 57), (('top', False), 57), (('tampu', False), 67), (('topi', False), 50), (('tepi', False), 50), (('tempat', False), 80), (('umut', False), 50), (('ampo', False), 50), (('timpa', False), 67), (('impi', False), 50), (('tempe', False), 67), (('tapa', False), 50), (('taat', False), 50), (('tepet', False), 67), (('umat', False), 50), (('tepat', False), 67), (('tut', False), 57), (('tumpat', True), 80), (('tuat', False), 50), (('tampi', True), 67), (('umpat', True), 67), (('temut', False), 67), (('emat', False), 50), (('ampit', False), 67), (('amit', False), 50), 
(('tempo', False), 67), (('tumpu', False), 67), (('tempa', False), 67), (('empu', False), 50), (('amat', False), 50), (('taut', False), 50), (('mat', False), 57), (('tampa', False), 67), (('tuit', False), 50), (('tip', False), 57), (('ampu', False), 50), (('tapi', False), 50)] \n", - "\n" + "CPU times: user 1.54 ms, sys: 27 µs, total: 1.57 ms\n", + "Wall time: 1.59 ms\n" ] }, { "data": { "text/plain": [ - "'tak jadi ke kenapa awak tak suka makan Husein kat situ tempat saya hate it pelik lah'" + "'saya memang tak ska makanan HUSEIN kampung tempat , saya love them . pelik lah saya'" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "normalizer.normalize(string,debug=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## List available deep learning stemming models" + "%%time\n", + "normalizer.normalize(string2, assume_wrong = False)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 450 µs, sys: 15 µs, total: 465 µs\n", + "Wall time: 482 µs\n" + ] + }, { "data": { "text/plain": [ - "['lstm', 'bahdanau', 'luong']" + "'Husein Zolkepli ( dua ribu sebelas hingga dua ribu sembilan belas ) adalah ketua kampung di kedah'" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "malaya.normalize.available_deep_model()" + "%%time\n", + "normalizer.normalize(string6, assume_wrong = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Load deep learning" + "## Load fuzzy normalizer" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 9, "metadata": {}, + "outputs": [], "source": [ - "We experimenting a lot for `seq2seq` models, we try to do the best normalizer deep learning models." 
+ "malays = malaya.load_malay_dictionary()\n", + "normalizer = malaya.normalize.fuzzy(malays)" ] }, { @@ -216,13 +231,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "jidiomik ke kenapa awak sukeesi makan Husein kat situ tempatmo saya hate it pelik lah\n" + "CPU times: user 7.54 s, sys: 83 ms, total: 7.63 s\n", + "Wall time: 7.9 s\n" ] }, { "data": { "text/plain": [ - "'saya memang sikeuoi maknnkano Husein kampanga tempt saya love them pelik lah'" + "'perdana menteri ke-sebelas sangat suka makan ayam , harganya cuma lima belas perpuluhan lima ringgit'" ] }, "execution_count": 10, @@ -231,9 +247,8 @@ } ], "source": [ - "normalizer = malaya.normalize.deep_model(malays, 'bahdanau')\n", - "print(normalizer.normalize(string))\n", - "normalizer.normalize(another)" + "%%time\n", + "normalizer.normalize(string3)" ] }, { @@ -245,13 +260,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "jadidilox ke kenapa awak sokeled makan Husein kat situ tampatgllah saya hate it pelik lah\n" + "CPU times: user 7.43 s, sys: 65.9 ms, total: 7.49 s\n", + "Wall time: 7.7 s\n" ] }, { "data": { "text/plain": [ - "'saya memang skeflleh makafnnloh Husein kampangja tempt saya love them pelik lah'" + "'saya memang tak saka makanan HUSEIN kampung tempat , saya love them . 
pelik lah saya'" ] }, "execution_count": 11, @@ -260,46 +276,9 @@ } ], "source": [ - "normalizer = malaya.normalize.deep_model(malays, 'luong')\n", - "print(normalizer.normalize(string))\n", - "normalizer.normalize(another)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "jajiodi ke kenapa awak sukeeia makan Husein kat situ tempatwa saya hate it pelik lah\n" - ] - }, - { - "data": { - "text/plain": [ - "'saya memang sekeoia makankari Husein kampangi tempt saya love them pelik lah'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "normalizer = malaya.normalize.deep_model(malays, 'lstm')\n", - "print(normalizer.normalize(string))\n", - "normalizer.normalize(another)" + "%%time\n", + "normalizer.normalize(string2)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/example/word2num/README.rst b/example/word2num/README.rst new file mode 100644 index 00000000..df39b90f --- /dev/null +++ b/example/word2num/README.rst @@ -0,0 +1,95 @@ + +.. code:: ipython3 + + import malaya + +.. code:: ipython3 + + malaya.word2num.word2num('dua belas') + + + + +.. parsed-literal:: + + 12 + + + +.. code:: ipython3 + + malaya.word2num.word2num('kesebelas') + + + + +.. parsed-literal:: + + 11 + + + +.. code:: ipython3 + + malaya.word2num.word2num('kesebelas') + + + + +.. parsed-literal:: + + 11 + + + +.. code:: ipython3 + + malaya.word2num.word2num('negatif kesebelas') + + + + +.. parsed-literal:: + + -11 + + + +.. code:: ipython3 + + malaya.word2num.word2num('seratus dua puluh tiga juta empat ratus lima puluh enam ribu tujuh ratus lapan puluh sembilan') + + + + +.. parsed-literal:: + + 123456789 + + + +.. 
code:: ipython3 + + malaya.word2num.word2num('negatif seratus dua puluh tiga juta empat ratus lima puluh enam ribu tujuh ratus lapan puluh sembilan') + + + + +.. parsed-literal:: + + -123456789 + + + +.. code:: ipython3 + + malaya.word2num.word2num('negatif satu juta dua ratus tiga puluh empat ribu lima ratus enam puluh tujuh perpuluhan lapan sembilan') + + + + +.. parsed-literal:: + + -1234567.89 + + diff --git a/example/word2num/load-word2num.ipynb b/example/word2num/load-word2num.ipynb new file mode 100644 index 00000000..73b7d726 --- /dev/null +++ b/example/word2num/load-word2num.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import malaya" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.word2num.word2num('dua belas')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.word2num.word2num('kesebelas')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.word2num.word2num('kesebelas')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-11" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.word2num.word2num('negatif kesebelas')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/plain": [ + "123456789" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.word2num.word2num('seratus dua puluh tiga juta empat ratus lima puluh enam ribu tujuh ratus lapan puluh sembilan')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-123456789" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.word2num.word2num('negatif seratus dua puluh tiga juta empat ratus lima puluh enam ribu tujuh ratus lapan puluh sembilan')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-1234567.89" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.word2num.word2num('negatif satu juta dua ratus tiga puluh empat ribu lima ratus enam puluh tujuh perpuluhan lapan sembilan')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/malaya/__init__.py b/malaya/__init__.py index eb3774a0..8fbd66b4 100644 --- a/malaya/__init__.py +++ b/malaya/__init__.py @@ -18,7 +18,7 @@ home = os.path.join(str(Path.home()), 'Malaya') version = '2.3' -bump_version = '2.3.4' +bump_version = '2.3.5' version_path = os.path.join(home, 'version') @@ -311,5 +311,6 @@ def describe_dependency(): from . import topic_model from . import toxic from . 
import word_mover +from . import word2num from . import word2vec from .texts import vectorizer diff --git a/malaya/normalize.py b/malaya/normalize.py index d763e090..634c72bd 100644 --- a/malaya/normalize.py +++ b/malaya/normalize.py @@ -6,9 +6,9 @@ import numpy as np import json +import re from fuzzywuzzy import fuzz from unidecode import unidecode -from collections import Counter from ._utils._utils import ( load_graph, check_file, @@ -30,123 +30,89 @@ PAD, EOS, UNK, + hujung_malaysian, + calon_dictionary, ) -from .spell import _return_possible, _edit_normalizer, _return_known -from .similarity import is_location -from ._utils._paths import MALAY_TEXT, PATH_NORMALIZER, S3_PATH_NORMALIZER - - -class _DEEP_NORMALIZER: - def __init__(self, x, logits, sess, dicts, corpus): - self._sess = sess - self._x = x - self._logits = logits - self._dicts = dicts - self._dicts['rev_dictionary_to'] = { - int(k): v for k, v in self._dicts['rev_dictionary_to'].items() - } - self.corpus = corpus - - def normalize(self, string, check_english = True): - """ - Normalize a string. 
- - Parameters - ---------- - string : str - - Returns - ------- - string: normalized string - """ - if not isinstance(string, str): - raise ValueError('input must be a string') - if not isinstance(check_english, bool): - raise ValueError('check_english must be a boolean') - - token_strings = normalizer_textcleaning(string).split() - results, need_to_normalize = [], [] - for word in token_strings: - if word.istitle(): - results.append(word) - continue - if check_english: - if word in ENGLISH_WORDS: - results.append(word) - continue - if word[0] == 'x' and len(word) > 1: - result_string = 'tak ' - word = word[1:] - else: - result_string = '' - if word[-2:] == 'la': - end_result_string = ' lah' - word = word[:-2] - elif word[-3:] == 'lah': - end_result_string = ' lah' - word = word[:-3] - else: - end_result_string = '' - if word in sounds: - results.append(result_string + sounds[word] + end_result_string) - continue - if word in rules_normalizer: - results.append( - result_string + rules_normalizer[word] + end_result_string - ) - continue - if word in self.corpus: - results.append(result_string + word + end_result_string) - continue - results.append('replace__me__') - need_to_normalize.append(word) +from .num2word import to_cardinal +from .word2num import word2num +from .preprocessing import _SocialTokenizer + +_tokenizer = _SocialTokenizer().tokenize +ignore_words = ['ringgit', 'sen'] +ignore_postfix = ['adalah'] + + +def _remove_postfix(word): + if word in ignore_postfix: + return word, '' + for p in hujung_malaysian: + if word.endswith(p): + return word[: -len(p)], ' lah' + return word, '' + + +def _normalize_ke(word): + # kesebelas -> ke-sebelas + # ke-21 -> ke-dua puluh satu + if word.startswith('ke'): + original = word + word = word.replace('-', '') + word = word.split('ke') + try: + num = word2num(word[1]) + except: + pass + try: + num = int(word[1]) + except: + return original + return 'ke-' + to_cardinal(num) + return word + + +def _normalize_title(word): + 
if word.istitle() or word.isupper(): + return calon_dictionary.get(word, word) + return word + + +def _is_number_regex(s): + if re.match('^\d+?\.\d+?$', s) is None: + return s.isdigit() + return True + + +def _string_to_num(word): + if '.' in word: + return float(word) + else: + return int(word) - normalized = [] - if len(need_to_normalize): - idx = stemmer_str_idx( - need_to_normalize, self._dicts['dictionary_from'] - ) - predicted = self._sess.run( - self._logits, - feed_dict = {self._x: pad_sentence_batch(idx, PAD)[0]}, - ) - for word in predicted: - normalized.append( - ''.join( - [ - self._dicts['rev_dictionary_to'][c] - for c in word - if c not in [GO, PAD, EOS, UNK] - ] - ) - ) - cp_results, current_replace = [], 0 - for i in range(len(results)): - if 'replace__me__' in results[i]: - if current_replace < len(normalized): - results[i] = normalized[current_replace] - cp_results.append(results[i]) - current_replace += 1 - else: - cp_results.append(results[i]) - return ' '.join(cp_results) +def _normalized_money(word): + original = word + word = word.lower() + if word[:2] == 'rm' and _is_number_regex(word[2:]): + return to_cardinal(_string_to_num(word[2:])) + ' ringgit' + elif word[-3:] == 'sen': + return to_cardinal(_string_to_num(word[:-3])) + ' sen' + else: + return original class _SPELL_NORMALIZE: - def __init__(self, corpus): - self.corpus = Counter(corpus) + def __init__(self, speller): + self._speller = speller - def normalize(self, string, debug = True, check_english = True): + def normalize(self, string, assume_wrong = True, check_english = True): """ Normalize a string Parameters ---------- string : str - - debug : bool, optional (default=True) - If true, it will print character similarity distances. + assume_wrong: bool, (default=True) + force speller to predict. check_english: bool, (default=True) check a word in english dictionary. 
@@ -156,19 +122,32 @@ def normalize(self, string, debug = True, check_english = True): """ if not isinstance(string, str): raise ValueError('input must be a string') - if not isinstance(debug, bool): - raise ValueError('debug must be a boolean') if not isinstance(check_english, bool): raise ValueError('check_english must be a boolean') + if not isinstance(assume_wrong, bool): + raise ValueError('assume_wrong must be a boolean') result = [] - for word in normalizer_textcleaning(string).split(): - if word.istitle(): + tokenized = _tokenizer(string) + index = 0 + while index < len(tokenized): + word = tokenized[index] + if len(word) < 2 and word not in sounds: + result.append(word) + index += 1 + continue + if word.lower() in ignore_words: result.append(word) + index += 1 + continue + if word.istitle() or word.isupper(): + result.append(_normalize_title(word)) + index += 1 continue if check_english: - if word in ENGLISH_WORDS: + if word.lower() in ENGLISH_WORDS: result.append(word) + index += 1 continue if len(word) > 2: if word[-2] in consonants and word[-1] == 'e': @@ -178,49 +157,70 @@ def normalize(self, string, debug = True, check_english = True): word = word[1:] else: result_string = '' - if word[-2:] == 'la': - end_result_string = ' lah' - word = word[:-2] - elif word[-3:] == 'lah': - end_result_string = ' lah' - word = word[:-3] - else: - end_result_string = '' + if word.lower() == 'ke' and index < (len(tokenized) - 2): + if tokenized[index + 1] == '-' and _is_number_regex( + tokenized[index + 2] + ): + result.append( + _normalize_ke( + word + tokenized[index + 1] + tokenized[index + 2] + ) + ) + index += 3 + continue + normalized_ke = _normalize_ke(word) + if normalized_ke != word: + result.append(normalized_ke) + index += 1 + continue + if _is_number_regex(word) and index < (len(tokenized) - 2): + if tokenized[index + 1] == '-' and _is_number_regex( + tokenized[index + 2] + ): + result.append( + to_cardinal(_string_to_num(word)) + + ' hingga ' + + 
to_cardinal(_string_to_num(tokenized[index + 2])) + ) + index += 3 + continue + if word.lower() == 'pada' and index < (len(tokenized) - 3): + if ( + _is_number_regex(tokenized[index + 1]) + and tokenized[index + 2] in '/-' + and _is_number_regex(tokenized[index + 3]) + ): + result.append( + 'pada %s hari bulan %s' + % ( + to_cardinal(_string_to_num(tokenized[index + 1])), + to_cardinal(_string_to_num(tokenized[index + 3])), + ) + ) + index += 4 + continue + money = _normalized_money(word) + if money != word: + result.append(money) + index += 1 + continue + + word, end_result_string = _remove_postfix(word) if word in sounds: result.append(result_string + sounds[word] + end_result_string) + index += 1 continue if word in rules_normalizer: result.append( result_string + rules_normalizer[word] + end_result_string ) + index += 1 continue - if word in self.corpus: - result.append(result_string + word + end_result_string) - continue - candidates = ( - _return_known([word], self.corpus) - or _return_known(_edit_normalizer(word), self.corpus) - or _return_possible(word, self.corpus, _edit_normalizer) - or [word] - ) - candidates = list(candidates) - candidates = [ - (candidate, is_location(candidate)) - for candidate in list(candidates) - ] - if debug: - print([(k, fuzz.ratio(word, k[0])) for k in candidates], '\n') - strings = [fuzz.ratio(word, k[0]) for k in candidates] - descending_sort = np.argsort(strings)[::-1] - selected = None - for index in descending_sort: - if not candidates[index][1]: - selected = candidates[index][0] - break - selected = ( - candidates[descending_sort[0]][0] if not selected else selected + selected = self._speller.correct( + word, debug = False, assume_wrong = assume_wrong ) result.append(result_string + selected + end_result_string) + index += 1 return ' '.join(result) @@ -231,7 +231,7 @@ def __init__(self, normalized, corpus): def normalize(self, string, fuzzy_ratio = 70, check_english = True): """ - Normalize a string. 
+ Normalize a string Parameters ---------- @@ -247,19 +247,32 @@ def normalize(self, string, fuzzy_ratio = 70, check_english = True): """ if not isinstance(string, str): raise ValueError('input must be a string') - if not isinstance(fuzzy_ratio, int): - raise ValueError('fuzzy_ratio must be an integer') if not isinstance(check_english, bool): raise ValueError('check_english must be a boolean') + if not isinstance(fuzzy_ratio, int): + raise ValueError('fuzzy_ratio must be an integer') result = [] - for word in normalizer_textcleaning(string).split(): - if word.istitle(): + tokenized = _tokenizer(string) + index = 0 + while index < len(tokenized): + word = tokenized[index] + if len(word) < 2 and word not in sounds: result.append(word) + index += 1 + continue + if word.lower() in ignore_words: + result.append(word) + index += 1 + continue + if word.istitle() or word.isupper(): + result.append(_normalize_title(word)) + index += 1 continue if check_english: - if word in ENGLISH_WORDS: + if word.lower() in ENGLISH_WORDS: result.append(word) + index += 1 continue if len(word) > 2: if word[-2] in consonants and word[-1] == 'e': @@ -269,24 +282,64 @@ def normalize(self, string, fuzzy_ratio = 70, check_english = True): word = word[1:] else: result_string = '' - if word[-2:] == 'la': - end_result_string = ' lah' - word = word[:-2] - elif word[-3:] == 'lah': - end_result_string = ' lah' - word = word[:-3] - else: - end_result_string = '' + if word.lower() == 'ke' and index < (len(tokenized) - 2): + if tokenized[index + 1] == '-' and _is_number_regex( + tokenized[index + 2] + ): + result.append( + _normalize_ke( + word + tokenized[index + 1] + tokenized[index + 2] + ) + ) + index += 3 + continue + normalized_ke = _normalize_ke(word) + if normalized_ke != word: + result.append(normalized_ke) + index += 1 + continue + if _is_number_regex(word) and index < (len(tokenized) - 2): + if tokenized[index + 1] == '-' and _is_number_regex( + tokenized[index + 2] + ): + result.append( + 
to_cardinal(_string_to_num(word)) + + ' hingga ' + + to_cardinal(_string_to_num(tokenized[index + 2])) + ) + index += 3 + continue + if word.lower() == 'pada' and index < (len(tokenized) - 3): + if ( + _is_number_regex(tokenized[index + 1]) + and tokenized[index + 2] in '/-' + and _is_number_regex(tokenized[index + 3]) + ): + result.append( + 'pada %s hari bulan %s' + % ( + to_cardinal(_string_to_num(tokenized[index + 1])), + to_cardinal(_string_to_num(tokenized[index + 3])), + ) + ) + index += 4 + continue + money = _normalized_money(word) + if money != word: + result.append(money) + index += 1 + continue + + word, end_result_string = _remove_postfix(word) if word in sounds: result.append(result_string + sounds[word] + end_result_string) + index += 1 continue if word in rules_normalizer: result.append( result_string + rules_normalizer[word] + end_result_string ) - continue - if word in self.corpus: - result.append(result_string + word + end_result_string) + index += 1 continue results = [] for i in range(len(self.normalized)): @@ -301,6 +354,7 @@ def normalize(self, string, fuzzy_ratio = 70, check_english = True): + self.corpus[np.argmax(results)] + end_result_string ) + index += 1 return ' '.join(result) @@ -351,23 +405,25 @@ def fuzzy(corpus): return _FUZZY_NORMALIZE(transform, corpus) -def spell(corpus): +def spell(speller): """ Train a Spelling Normalizer Parameters ---------- - corpus : list of strings. Prefer to feed with malaya.load_malay_dictionary(). 
+ speller : Malaya spelling correction object Returns ------- - SPELL_NORMALIZE: Trained malaya.normalizer._SPELL_NORMALIZE class + SPELL_NORMALIZE: malaya.normalizer._SPELL_NORMALIZE class """ - if not isinstance(corpus, list): - raise ValueError('corpus must be a list') - if not isinstance(corpus[0], str): - raise ValueError('corpus must be list of strings') - return _SPELL_NORMALIZE([unidecode(w) for w in corpus]) + if not hasattr(speller, 'correct') and not hasattr( + speller, 'normalize_elongated' + ): + raise ValueError( + 'speller must has `correct` or `normalize_elongated` method' + ) + return _SPELL_NORMALIZE(speller) def basic(string): @@ -396,53 +452,3 @@ def basic(string): else: result.append(word) return ' '.join(result) - - -def available_deep_model(): - """ - List available deep learning stemming models. - """ - return ['lstm', 'bahdanau', 'luong'] - - -def deep_model(corpus, model = 'bahdanau', validate = True): - """ - Load deep-learning model to normalize a string. - - Parameters - ---------- - validate: bool, optional (default=True) - if True, malaya will check model availability and download if not available. 
- - Returns - ------- - DEEP_NORMALIZER: malaya.normalizer._DEEP_NORMALIZER class - - """ - if not isinstance(corpus, list): - raise ValueError('corpus must be a list') - if not isinstance(corpus[0], str): - raise ValueError('corpus must be list of strings') - if validate: - check_file(PATH_NORMALIZER[model], S3_PATH_NORMALIZER[model]) - else: - if not check_available(PATH_NORMALIZER[model]): - raise Exception( - 'normalizer is not available, please `validate = True`' - ) - try: - with open(PATH_NORMALIZER[model]['setting'], 'r') as fopen: - dic_normalizer = json.load(fopen) - g = load_graph(PATH_NORMALIZER[model]['model']) - except: - raise Exception( - "model corrupted due to some reasons, please run malaya.clear_cache('normalizer/%s') and try again" - % (model) - ) - return _DEEP_NORMALIZER( - g.get_tensor_by_name('import/Placeholder:0'), - g.get_tensor_by_name('import/logits:0'), - generate_session(graph = g), - dic_normalizer, - [unidecode(w) for w in corpus], - ) diff --git a/malaya/spell.py b/malaya/spell.py index e7edf730..38b0ae0d 100644 --- a/malaya/spell.py +++ b/malaya/spell.py @@ -88,7 +88,7 @@ def __init__(self, corpus): self.occurences = _build_dicts(self.corpus) self.corpus = Counter(corpus) - def correct(self, string, first_char = True, debug = True): + def correct(self, string, first_char = True, debug = True, **kwargs): """ Correct a word. @@ -268,7 +268,7 @@ def edit_candidates(self, word, assume_wrong = False, fast = True): ) @lru_cache(maxsize = 65536) - def correct(self, word, assume_wrong = False, fast = False): + def correct(self, word, assume_wrong = False, fast = False, **kwargs): """ Most probable spelling correction for word. 
""" diff --git a/malaya/texts/_tatabahasa.py b/malaya/texts/_tatabahasa.py index 1c2212d2..04d490fd 100644 --- a/malaya/texts/_tatabahasa.py +++ b/malaya/texts/_tatabahasa.py @@ -230,6 +230,9 @@ 'loh', 'lohh', ] + +hujung_malaysian = ['lah', 'la', 'ler'] + alphabet = 'qwertyuiopasdfghjklzxcvbnm' consonants = 'bcdfghjklmnpqrstvwxyz' vowels = 'aeiou' @@ -246,6 +249,7 @@ 't': 'nanti', 'p': 'pergi', 'wai': 'kenapa', + 'i': 'saya', } tatabahasa_dict = { @@ -1018,6 +1022,19 @@ 'dato', 'dsp', ] + +calon_dictionary = { + 'dr': 'Doktor', + 'yb': 'Yang Berhormat', + 'hj': 'Haji', + 'ybm': 'Yang Berhormat Mulia', + 'tyt': 'Tuan Yang Terutama', + 'yab': 'Yang Berhormat', + 'ybm': 'Yang Berhormat Mulia', + 'yabhg': 'Yang Amat Berbahagia', + 'ybhg': 'Yang Berbahagia', + 'miss': 'Cik', +} stopwords = [ 'ada', 'inikah', diff --git a/malaya/word2num.py b/malaya/word2num.py new file mode 100644 index 00000000..2a49b862 --- /dev/null +++ b/malaya/word2num.py @@ -0,0 +1,217 @@ +from __future__ import print_function, unicode_literals + +import sys +import warnings + +if not sys.warnoptions: + warnings.simplefilter('ignore') + +malaysian_number_system = { + 'kosong': 0, + 'satu': 1, + 'dua': 2, + 'tiga': 3, + 'empat': 4, + 'lima': 5, + 'enam': 6, + 'tujuh': 7, + 'lapan': 8, + 'sembilan': 9, + 'sepuluh': 10, + 'seribu': 1000, + 'sejuta': 1000000, + 'seratus': 100, + 'sebelas': 11, + 'ratus': 100, + 'ribu': 1000, + 'juta': 1000000, + 'bilion': 1000000000, + 'perpuluhan': '.', + 'negatif': -1, + 'belas': 10, + 'puluh': 10, + 'pertama': 1, +} + +decimal_words = [ + 'kosong', + 'satu', + 'dua', + 'tiga', + 'empat', + 'lima', + 'enam', + 'tujuh', + 'lapan', + 'sembilan', +] + + +def _get_decimal_sum(decimal_digit_words): + decimal_number_str = [] + for dec_word in decimal_digit_words: + if dec_word not in decimal_words: + return 0 + else: + decimal_number_str.append(malaysian_number_system[dec_word]) + final_decimal_string = '0.' 
+ ''.join(map(str, decimal_number_str)) + return float(final_decimal_string) + + +def _number_formation(number_words): + numbers = [] + belas = False + for number_word in number_words: + if number_word in ['belas', 'sebelas']: + belas = True + numbers.append(malaysian_number_system[number_word]) + if len(numbers) == 5: + return ( + (numbers[0] * numbers[1]) + (numbers[2] * numbers[3]) + numbers[4] + ) + elif len(numbers) == 4: + if numbers[0] == 100: + return numbers[0] + (numbers[1] * numbers[2]) + numbers[3] + return (numbers[0] * numbers[1]) + numbers[2] + numbers[3] + elif len(numbers) == 3: + if belas: + return numbers[0] + numbers[1] + numbers[2] + return numbers[0] * numbers[1] + numbers[2] + elif len(numbers) == 2: + if 100 in numbers or 10 in numbers: + if belas: + return numbers[0] + numbers[1] + return numbers[0] * numbers[1] + else: + return numbers[0] + numbers[1] + else: + return numbers[0] + + +def word2num(string): + if not isinstance(string, str): + raise ValueError('input must be a string') + + string = string.replace('-', ' ') + string = string.replace('ke', '') + string = string.replace('dan', '') + string = string.lower() + + if string.isdigit(): + return int(string) + + split_words = string.strip().split() + + clean_numbers = [] + clean_decimal_numbers = [] + + for word in split_words: + if word in malaysian_number_system: + clean_numbers.append(word) + + if not len(clean_numbers): + raise ValueError( + 'No valid number words found! Please enter a valid number word' + ) + + if ( + clean_numbers.count('ribu') > 1 + or clean_numbers.count('juta') > 1 + or clean_numbers.count('bilion') > 1 + or clean_numbers.count('perpuluhan') > 1 + or clean_numbers.count('negatif') > 1 + or clean_numbers.count('seribu') > 1 + or clean_numbers.count('sejuta') > 1 + ): + raise ValueError( + 'Redundant number word! 
Please enter a valid number word' + ) + + negative = False + if clean_numbers[0] == 'negatif': + negative = True + clean_numbers = clean_numbers[1:] + + if clean_numbers.count('perpuluhan') == 1: + clean_decimal_numbers = clean_numbers[ + clean_numbers.index('perpuluhan') + 1 : + ] + clean_numbers = clean_numbers[: clean_numbers.index('perpuluhan')] + + billion_index = ( + clean_numbers.index('bilion') if 'bilion' in clean_numbers else -1 + ) + million_index = ( + clean_numbers.index('juta') if 'juta' in clean_numbers else -1 + ) + thousand_index = ( + clean_numbers.index('ribu') if 'ribu' in clean_numbers else -1 + ) + + if ( + thousand_index > -1 + and (thousand_index < million_index or thousand_index < billion_index) + ) or (million_index > -1 and million_index < billion_index): + raise ValueError('Malformed number! Please enter a valid number word') + + total_sum = 0 + + if len(clean_numbers) > 0: + if len(clean_numbers) == 1: + total_sum += malaysian_number_system[clean_numbers[0]] + else: + if billion_index > -1: + billion_multiplier = _number_formation( + clean_numbers[0:billion_index] + ) + total_sum += billion_multiplier * 1000000000 + + if million_index > -1: + if billion_index > -1: + million_multiplier = _number_formation( + clean_numbers[billion_index + 1 : million_index] + ) + else: + million_multiplier = _number_formation( + clean_numbers[0:million_index] + ) + total_sum += million_multiplier * 1000000 + + if thousand_index > -1: + if million_index > -1: + thousand_multiplier = _number_formation( + clean_numbers[million_index + 1 : thousand_index] + ) + elif billion_index > -1 and million_index == -1: + thousand_multiplier = _number_formation( + clean_numbers[billion_index + 1 : thousand_index] + ) + else: + thousand_multiplier = _number_formation( + clean_numbers[0:thousand_index] + ) + total_sum += thousand_multiplier * 1000 + + if thousand_index > -1 and thousand_index != len(clean_numbers) - 1: + hundreds = _number_formation( + 
clean_numbers[thousand_index + 1 :] + ) + elif million_index > -1 and million_index != len(clean_numbers) - 1: + hundreds = _number_formation(clean_numbers[million_index + 1 :]) + elif billion_index > -1 and billion_index != len(clean_numbers) - 1: + hundreds = _number_formation(clean_numbers[billion_index + 1 :]) + elif ( + thousand_index == -1 + and million_index == -1 + and billion_index == -1 + ): + hundreds = _number_formation(clean_numbers) + else: + hundreds = 0 + total_sum += hundreds + + if len(clean_decimal_numbers) > 0: + decimal_sum = _get_decimal_sum(clean_decimal_numbers) + total_sum += decimal_sum + + return total_sum * -1 if negative else total_sum diff --git a/setup-gpu.py b/setup-gpu.py index 6668a824..b33bbcf6 100644 --- a/setup-gpu.py +++ b/setup-gpu.py @@ -6,7 +6,7 @@ setuptools.setup( name = __packagename__, packages = setuptools.find_packages(), - version = '2.3.4', + version = '2.3.5', python_requires = '>=3.6.*', description = 'Natural-Language-Toolkit for bahasa Malaysia, powered by Deep Learning Tensorflow. GPU Version', author = 'huseinzol05', @@ -39,5 +39,4 @@ 'Operating System :: OS Independent', 'Topic :: Text Processing', ], - long_description = open('readme-pypi.rst').read(), ) diff --git a/setup.py b/setup.py index cf60bd42..d9cd6277 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name = __packagename__, packages = setuptools.find_packages(), - version = '2.3.4', + version = '2.3.5', python_requires = '>=3.6.*', description = 'Natural-Language-Toolkit for bahasa Malaysia, powered by Deep Learning Tensorflow.', author = 'huseinzol05', @@ -39,5 +39,4 @@ 'Operating System :: OS Independent', 'Topic :: Text Processing', ], - long_description = open('readme-pypi.rst').read(), )