From 95a2ee393a8491a6ea77aee91c5f9bf42eaddd60 Mon Sep 17 00:00:00 2001
From: Vunb
Date: Fri, 11 May 2018 22:24:37 +0700
Subject: [PATCH 1/8] update api usage

---
 README.md                |  8 +++++++-
 lib/langid/index.js      | 12 ++++++++++--
 lib/vntk.js              | 32 ++++++++++++++++++++++++++++----
 package.json             |  2 +-
 test/specs/dictionary.js |  2 +-
 test/specs/langid.js     |  2 +-
 6 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 6072caf..cdacae1 100644
--- a/README.md
+++ b/README.md
@@ -348,7 +348,7 @@ Api usage example:
 * langid.langids - list of supported languages
 
 ```js
-const langid = require('vntk').Langid();
+const langid = require('vntk').langid();
 
 // returns the most likely language detected
 langid.detect('sử dụng vntk với fastext rất tuyệt?')
@@ -370,6 +370,12 @@
 langid.getLanguages('Wie lange bleiben Sie?', 5)
 console.log(langid.langids)
 ```
 
+Load custom trained model:
+
+```js
+var vntk = require('vntk');
+var langid = vntk.langid(new_model_path);
+```
 
 List of supported languages
 
diff --git a/lib/langid/index.js b/lib/langid/index.js
index 6fa7652..d11fc27 100644
--- a/lib/langid/index.js
+++ b/lib/langid/index.js
@@ -5,10 +5,10 @@ const logger = require('../logger')('Langid');
 
 class Langid {
 
-    constructor() {
+    constructor(fn) {
         try {
-            this.model_filename = path.resolve(__dirname, './lid.176.ftz');
+            this.model_filename = fn || path.resolve(__dirname, './lid.176.ftz');
             logger.info('load model: ' + this.model_filename)
             this.classifier = new FastTextClassifier(this.model_filename);
             logger.info(`load model ${this.model_filename} success!`);
@@ -18,6 +18,14 @@ class Langid {
 
     }
 
+    /**
+     * Create new predictor from custom model
+     * @param {String} fn filename
+     */
+    newModel(fn) {
+        return new Langid(fn);
+    }
+
     get langids() {
         let _langids = 'af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh';
         return _langids.split(' ');
diff --git a/lib/vntk.js b/lib/vntk.js
index adf9ee7..eaa74b0 100644
--- a/lib/vntk.js
+++ b/lib/vntk.js
@@ -8,6 +8,7 @@
 'use strict';
 
 const fs = require('fs')
+const util = require('util')
 
 // singleton instance
 exports.util = () => require('./util');
@@ -61,13 +62,36 @@ exports.ner = (modelFileName) => {
     }
 };
 
-// exports class
+/**
+ * Langid - Language identification
+ * @param {String} modelFileName new custom model
+ */
+exports.langid = (modelFileName) => {
+    if(modelFileName && fs.existsSync(modelFileName)) {
+        return require('./langid').newModel(modelFileName)
+    } else {
+        return require('./langid')
+    }
+};
+exports.dictionary = (modelFileName) => {
+    if(modelFileName && fs.existsSync(modelFileName)) {
+        // parentheses make `new` apply to the Dictionary constructor
+        return new (require('@vntk/dictionary').Dictionary)(modelFileName)
+    } else {
+        return require('@vntk/dictionary')
+    }
+}
+
+// exports classes
+// These follow the CamelCase naming convention.
 exports.TfIdf = require('./tfidf');
 exports.BayesClassifier = require('./classifiers').BayesClassifier;
 exports.LogisticRegressionClassifier = require('./classifiers').LogisticRegressionClassifier;
 exports.FastTextClassifier = require('./classifiers').FastTextClassifier;
-exports.Langid = () => require('./langid');
-// external components
-exports.getDictionary = () => require('@vntk/dictionary');
\ No newline at end of file
+/**
+ * Deprecated.
+ * Please use the lowerCamelCase API, which also accepts a custom model.
+ */
+exports.Langid = util.deprecate(exports.langid, '`vntk.Langid()` is deprecated, please use `vntk.langid([custom_model])` instead.')
+exports.getDictionary = util.deprecate(exports.dictionary, '`vntk.getDictionary()` is deprecated, please use `vntk.dictionary([custom_model])` instead.')
diff --git a/package.json b/package.json
index 807739e..43764bf 100644
--- a/package.json
+++ b/package.json
@@ -36,7 +36,7 @@
     "title-case": "^2.1.1"
   },
   "devDependencies": {
-    "nan": "^2.4.0",
+    "nan": "^2.10.0",
     "tap-spec": "^4.1.1",
     "tape": "^4.9.0"
   }
diff --git a/test/specs/dictionary.js b/test/specs/dictionary.js
index 6f66469..d2c22cf 100644
--- a/test/specs/dictionary.js
+++ b/test/specs/dictionary.js
@@ -1,7 +1,7 @@
 'use strict';
 const test = require('tape');
 const vntk = require('../../lib/vntk');
-const dictionary = vntk.getDictionary();
+const dictionary = vntk.dictionary();
 
 test('vntk dictionary', function (t) {
     t.plan(1);
diff --git a/test/specs/langid.js b/test/specs/langid.js
index 92ec445..546b391 100644
--- a/test/specs/langid.js
+++ b/test/specs/langid.js
@@ -1,7 +1,7 @@
 'use strict';
 const test = require('tape');
 const path = require('path');
-const langid = require('../../lib/vntk').Langid();
+const langid = require('../../lib/vntk').langid();
 
 test('vntk language identification', function (t) {
     t.plan(3);

From 6bc7c89b5c0193c5519b1057ba80621e7b92f3ed Mon Sep 17 00:00:00 2001
From: Vunb
Date: Sat, 12 May 2018 00:34:18 +0700
Subject: [PATCH 2/8] refactoring word tokenizer

---
 lib/classifiers/classifier.js                     | 2 +-
 lib/pos_tag/index.js                              | 9 ++++-----
 lib/vntk.js                                       | 6 +++---
 lib/{word_sent => word_tokenizer}/index.js        | 8 ++++----
 lib/{word_sent => word_tokenizer}/model.bin       | Bin
 .../{word_sent.js => tokenizer/word_tokenizer.js} | 6 +++---
 test/start.js                                     | 2 +-
 7 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/lib/classifiers/classifier.js b/lib/classifiers/classifier.js
index 7869508..12fcc27 100644
--- a/lib/classifiers/classifier.js
+++ b/lib/classifiers/classifier.js
@@ -1,7 +1,7 @@
 'use strict';
 
 const EventEmitter = require('events').EventEmitter;
-const tagger = require('../word_sent');
+const tagger = require('../word_tokenizer');
 const stopwords = require('../util/stopwords');
 
 class Classifier extends EventEmitter {
diff --git a/lib/pos_tag/index.js b/lib/pos_tag/index.js
index 7aca974..90ca40e 100644
--- a/lib/pos_tag/index.js
+++ b/lib/pos_tag/index.js
@@ -1,9 +1,8 @@
 'use strict';
 const path = require('path');
 const crfsuite = require('crfsuite');
-const tokenizer = require('../tokenizer');
-const word_sent = require('../word_sent');
-const fe = require('../features');
+const tokenizer = require('../word_tokenizer');
+const features = require('../features');
 
 const logger = require('../logger')('POSTag');
 
@@ -42,7 +41,7 @@
     }
 
     tag(text) {
-        let words = word_sent.tag(text);
+        let words = tokenizer.tag(text);
         let tokens = words.map((token) => {
             return [token, 'X']
         });
@@ -54,7 +53,7 @@
 
     transform(tokens) {
         let template = this.template;
-        return tokens.map((token, i) => fe.word2features(tokens, i, template));
+        return tokens.map((token, i) => features.word2features(tokens, i, template));
     }
 }
diff --git a/lib/vntk.js b/lib/vntk.js
index eaa74b0..cf5dfe7 100644
--- a/lib/vntk.js
+++ b/lib/vntk.js
@@ -18,11 +18,11 @@ exports.tokenizer = () => require('./tokenizer');
 /**
  * Word Segmentation
  * @param {String} modelFileName new custom model
  */
-exports.wordSent = (modelFileName) => {
+exports.wordTokenizer = (modelFileName) => {
     if(modelFileName && fs.existsSync(modelFileName)) {
-        return require('./word_sent').newModel(modelFileName)
+        return require('./word_tokenizer').newModel(modelFileName)
     } else {
-        return require('./word_sent')
+        return require('./word_tokenizer')
     }
 }
diff --git a/lib/word_sent/index.js b/lib/word_tokenizer/index.js
similarity index 95%
rename from lib/word_sent/index.js
rename to lib/word_tokenizer/index.js
index c41fa8d..8165b50 100644
--- a/lib/word_sent/index.js
+++ b/lib/word_tokenizer/index.js
@@ -4,9 +4,9 @@
 const crfsuite = require('crfsuite');
 const tokenizer = require('../tokenizer');
 const fe = require('../features');
 
-const logger = require('../logger')('WordSent');
+const logger = require('../logger')('WordTokenizer');
 
-class WordSent {
+class WordTokenizer {
 
     constructor(fn) {
         this.tagger = crfsuite.Tagger();
@@ -24,7 +24,7 @@
      * @param {String} fn filename
      */
     newModel(fn) {
-        return new WordSent(fn);
+        return new WordTokenizer(fn);
     }
 
     get template() {
@@ -100,4 +100,4 @@
     }
 }
 
-module.exports = new WordSent();
\ No newline at end of file
+module.exports = new WordTokenizer();
\ No newline at end of file
diff --git a/lib/word_sent/model.bin b/lib/word_tokenizer/model.bin
similarity index 100%
rename from lib/word_sent/model.bin
rename to lib/word_tokenizer/model.bin
diff --git a/test/specs/word_sent.js b/test/specs/tokenizer/word_tokenizer.js
similarity index 87%
rename from test/specs/word_sent.js
rename to test/specs/tokenizer/word_tokenizer.js
index 56fa3cf..26deb17 100644
--- a/test/specs/word_sent.js
+++ b/test/specs/tokenizer/word_tokenizer.js
@@ -1,9 +1,9 @@
 'use strict';
 var test = require('tape'),
-    vntk = require('../../lib/vntk'),
-    ws = vntk.wordSent();
+    vntk = require('../../../lib/vntk'),
+    ws = vntk.wordTokenizer();
 
-test('word_sent simple case', function (t) {
+test('wordTokenizer simple case', function (t) {
     t.plan(9);
 
     t.equal(ws.tag('Thương mại và các sản phẩm cũng vậy.', 'text'), 'Thương_mại và các sản_phẩm cũng vậy .');
diff --git a/test/start.js b/test/start.js
index adc1b05..943ade5 100644
--- a/test/start.js
+++ b/test/start.js
@@ -8,7 +8,7 @@ var dir = '../test/specs/';
     'util',
     'tokenizer',
    'normalizer',
-    'word_sent',
+    'tokenizer/word_tokenizer',
     'pos_tag',
     'chunking',
     'ner',

From 384a57a06eda558a5be68211f4662e32e8af3b50 Mon Sep 17 00:00:00 2001
From: Vunb
Date: Sat, 12 May 2018 00:43:06 +0700
Subject: [PATCH 3/8] mark `vntk.wordSent()` as deprecated

---
 README.md   | 17 +++++++----------
 lib/vntk.js |  1 +
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index cdacae1..c2a307b 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ If you are interested in contributing to **vntk**, or just hacking on it, then f
 
 ## 1. Tokenizer
 
-> Word Tokenizer using Regular Expression.
+> Regex Tokenizer, based on regular expressions.
 > Tokenizer is provided to break text into arrays of tokens!
 
 Example:
@@ -57,27 +57,24 @@
 
 ## 2. Word Segmentation
 
-> Vietnamese Word Segmentation using Conditional Random Fields, called: `word_sent`.
-> Word_Sent helps break text into arrays of words!
+> Vietnamese Word Segmentation using Conditional Random Fields, called: `Word Tokenizer`.
+> wordTokenizer helps break text into arrays of words!
 
 ```js
 var vntk = require('vntk');
-var word_sent = vntk.wordSent();
+var tokenizer = vntk.wordTokenizer();
 
-console.log(word_sent.tag('Chào mừng các bạn trẻ tới thành phố Hà Nội'));
+console.log(tokenizer.tag('Chào mừng các bạn trẻ tới thành phố Hà Nội'));
 // [ 'Chào mừng', 'các', 'bạn', 'trẻ', 'tới', 'thành phố', 'Hà Nội' ]
-
-console.log(word_sent.tag('Chào mừng các bạn trẻ tới thành phố Hà Nội', 'text'));
-// Chào_mừng các bạn trẻ tới thành_phố Hà_Nội
 ```
 
 Load custom trained model:
 
 ```js
 var vntk = require('vntk');
-var word_sent = vntk.wordSent(new_model_path);
+var tokenizer = vntk.wordTokenizer(new_model_path);
 
-console.log(word_sent.tag('Chào mừng các bạn trẻ tới thành phố Hà Nội', 'text'));
+console.log(tokenizer.tag('Chào mừng các bạn trẻ tới thành phố Hà Nội', 'text'));
 // Chào_mừng các bạn trẻ tới thành_phố Hà_Nội
 ```
diff --git a/lib/vntk.js b/lib/vntk.js
index cf5dfe7..a0bd32f 100644
--- a/lib/vntk.js
+++ b/lib/vntk.js
@@ -95,3 +95,4 @@ exports.FastTextClassifier = require('./classifiers').FastTextClassifier;
  */
 exports.Langid = util.deprecate(exports.langid, '`vntk.Langid()` is deprecated, please use `vntk.langid([custom_model])` instead.')
 exports.getDictionary = util.deprecate(exports.dictionary, '`vntk.getDictionary()` is deprecated, please use `vntk.dictionary([custom_model])` instead.')
+exports.wordSent = util.deprecate(exports.wordTokenizer, '`vntk.wordSent()` is deprecated, please use `vntk.wordTokenizer([custom_model])` instead.')

From b66118411fd537350b013f05b6bd933ecd8a8b61 Mon Sep 17 00:00:00 2001
From: Vunb
Date: Sat, 12 May 2018 00:49:29 +0700
Subject: [PATCH 4/8] fix npm version for building

---
 .travis.yml  | 2 +-
 appveyor.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 58d9b81..0256256 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,7 +14,7 @@ node_js:
 
 before_install:
 # update npm to latest
-- npm install -g npm@latest
+- npm install -g npm@5.7.1
 
 # show product version
 - nvm --version
diff --git a/appveyor.yml b/appveyor.yml
index 3544bd3..b6662ae 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -18,7 +18,7 @@
 install:
   # Get the latest stable version of Node.js or io.js
   - ps: Install-Product node $env:nodejs_version
   # update npm to latest
-  - npm install -g npm@latest
+  - npm install -g npm@5.7.1
   # install modules
   - npm install

From 597221a478069065740c6659a98c995b656a0fa9 Mon Sep 17 00:00:00 2001
From: Vunb
Date: Sat, 12 May 2018 01:55:54 +0700
Subject: [PATCH 5/8] add text format for chunking, pos tagging, ner

---
 README.md              | 13 ++++++++-----
 lib/chunking/index.js  | 39 +++++++++++++++++++++++++++++++++++++--
 lib/ner/index.js       | 41 +++++++++++++++++++++++++++++++++++++++--
 lib/pos_tag/index.js   | 10 ++++++++--
 test/specs/chunking.js | 11 +++++++++++
 test/specs/ner.js      | 14 +++++++++++++-
 test/specs/pos_tag.js  | 11 +++++++++++
 7 files changed, 127 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index c2a307b..ce3cb32 100644
--- a/README.md
+++ b/README.md
@@ -109,7 +109,8 @@
 
 var vntk = require('vntk');
 var pos_tag = vntk.posTag(new_model_path);
 
-console.log(pos_tag.tag('Chợ thịt chó nổi tiếng ở TP Hồ Chí Minh bị truy quét'))
+console.log(pos_tag.tag('Cán bộ xã và những chiêu "xin làm hộ nghèo" cười ra nước mắt', 'text'))
+// [N Cán bộ] [N xã] [C và] [L những] [N chiêu] [CH "] [V xin] [V làm] [N hộ] [A nghèo] [CH "] [V cười] [V ra] [N nước mắt]
 ```
 
 Command line: `vntk pos <file_name.txt>`
@@ -146,7 +147,8 @@
 var vntk = require('vntk');
 var chunking = vntk.chunking(new_model_path);
 
-console.log(chunking.tag('Nhật ký SEA Games ngày 21/8: Ánh Viên thắng giòn giã ở vòng loại.'));
+console.log(chunking.tag('Nhật ký SEA Games ngày 21/8: Ánh Viên thắng giòn giã ở vòng loại.', 'text'));
+// [NP Nhật ký] [NP SEA] [NP Games] [NP ngày] [NP 21/8] : [NP Ánh Viên] [VP thắng] [NP giòn giã] [PP ở] [NP vòng] [NP loại] .
 ```
 
 Command line: `vntk chunk <file_name.txt>`
@@ -179,7 +181,8 @@
 var vntk = require('vntk');
 var ner = vntk.ner(new_model_path);
 
-console.log(ner.tag('Chưa tiết lộ lịch trình tới Việt Nam của Tổng thống Mỹ Donald Trump'))
+console.log(ner.tag('Chưa tiết lộ lịch trình tới Việt Nam của Tổng thống Mỹ Donald Trump', 'text'))
+// Chưa tiết lộ lịch trình tới [LOC Việt Nam] của Tổng thống [LOC Mỹ] [PER Donald Trump]
 ```
 
 Command line: `vntk ner <file_name.txt>`
@@ -192,7 +195,7 @@
 
 ```js
 var vntk = require('vntk');
-var dictionary = vntk.getDictionary();
+var dictionary = vntk.dictionary();
 
 dictionary.has('chào');
 // true
@@ -202,7 +205,7 @@
 
 ```js
 var vntk = require('vntk');
-var dictionary = vntk.getDictionary();
+var dictionary = vntk.dictionary();
 
 var senses = dictionary.lookup('chào');
 console.log(senses);
diff --git a/lib/chunking/index.js b/lib/chunking/index.js
index ab01bfb..444322d 100644
--- a/lib/chunking/index.js
+++ b/lib/chunking/index.js
@@ -1,4 +1,5 @@
 'use strict';
+const _ = require('lodash');
 const path = require('path');
 const crfsuite = require('crfsuite');
 const tokenizer = require('../tokenizer');
@@ -43,7 +44,40 @@ class Chucking {
         ];
     }
 
-    tag(text) {
+    format(tokens) {
+        var prev_prefix_label = ''
+        var result = _.reduce(tokens, (out, tok) => {
+            if (tok[2] === 'O') {
+                if (prev_prefix_label !== '') {
+                    prev_prefix_label = '';
+                    return out + '] ' + tok[0] + ' ';
+                } else {
+                    prev_prefix_label = '';
+                    return out + ' ' + tok[0] + ' ';
+                }
+            } else {
+                var tmp = tok[2].split('-');
+                if (tmp[0] === 'B') {
+                    // if a tag is still open from an earlier B, close it
+                    // before opening the current tag
+                    if (prev_prefix_label !== '') {
+                        prev_prefix_label = tmp[0];
+                        return out + `] [${tmp[1]} ${tok[0]}`;
+                    } else {
+                        prev_prefix_label = tmp[0];
+                        return out + `[${tmp[1]} ${tok[0]}`;
+                    }
+                } else {
+                    prev_prefix_label = tmp[0];
+                    return out + ` ${tok[0]}`;
+                }
+            }
+        }, ' ')
+        if (prev_prefix_label !== '') result += ']';
+        return result;
+    }
+
+    tag(text, isFormat) {
         let pos_tags = pos_tag.tag(text);
         let tokens = pos_tags.map((tags) => {
             return [tags[0], tags[1], 'X']
@@ -51,7 +85,8 @@
         let x = this.transform(tokens);
         let tags = this.tagger.tag(x);
 
-        return pos_tags.map((pos_tags, index) => [pos_tags[0], pos_tags[1], tags[index]]);
+        let result = pos_tags.map((pos_tags, index) => [pos_tags[0], pos_tags[1], tags[index]]);
+        return !isFormat ? result : this.format(result).trim();
     }
 
     transform(tokens) {
diff --git a/lib/ner/index.js b/lib/ner/index.js
index 5f91f38..9806c53 100644
--- a/lib/ner/index.js
+++ b/lib/ner/index.js
@@ -1,4 +1,5 @@
 'use strict';
+const _ = require('lodash');
 const path = require('path');
 const crfsuite = require('crfsuite');
 const tokenizer = require('../tokenizer');
@@ -43,7 +44,41 @@
         ];
     }
 
-    tag(text) {
+    format(tokens) {
+        var prev_prefix_label = ''
+        var result = _.reduce(tokens, (out, tok) => {
+            if (tok[3] === 'O') {
+                if (prev_prefix_label !== '') {
+                    prev_prefix_label = '';
+                    return out + '] ' + tok[0] + ' ';
+                } else {
+                    prev_prefix_label = '';
+                    return out + ' ' + tok[0] + ' ';
+                }
+            } else {
+                var tmp = tok[3].split('-');
+                if (tmp[0] === 'B') {
+                    // if a tag is still open from an earlier B, close it
+                    // before opening the current tag
+                    if (prev_prefix_label !== '') {
+                        prev_prefix_label = tmp[0];
+                        return out + `] [${tmp[1]} ${tok[0]}`;
+                    } else {
+                        prev_prefix_label = tmp[0];
+                        return out + `[${tmp[1]} ${tok[0]}`;
+                    }
+                } else {
+                    prev_prefix_label = tmp[0];
+                    return out + ` ${tok[0]}`;
+                }
+            }
+        }, ' ')
+        if (prev_prefix_label !== '') result += ']';
+        return result;
+    }
+
+    tag(text, isFormat) {
         let chunk_tags = chunking.tag(text);
         let tokens = chunk_tags.map((tags) => {
             return [tags[0], tags[1], tags[2], 'X']
@@ -51,7 +85,10 @@
         let x = this.transform(tokens);
         let tags = this.tagger.tag(x);
 
-        return chunk_tags.map((chunk_tags, index) => [chunk_tags[0], chunk_tags[1], chunk_tags[2], tags[index]]);
+        let result = chunk_tags.map((chunk_tags, index) => [chunk_tags[0], chunk_tags[1], chunk_tags[2], tags[index]]);
+
+        return !isFormat ? result : this.format(result).trim();
+
     }
 
     transform(tokens) {
diff --git a/lib/pos_tag/index.js b/lib/pos_tag/index.js
index 90ca40e..3fdf728 100644
--- a/lib/pos_tag/index.js
+++ b/lib/pos_tag/index.js
@@ -1,4 +1,5 @@
 'use strict';
+const _ = require('lodash');
 const path = require('path');
 const crfsuite = require('crfsuite');
 const tokenizer = require('../word_tokenizer');
@@ -40,7 +41,11 @@
         ];
     }
 
-    tag(text) {
+    format(tokens) {
+        return _.reduce(tokens, (out, tok) => out + `[${tok[1]} ${tok[0]}] `, '')
+    }
+
+    tag(text, isFormat) {
         let words = tokenizer.tag(text);
         let tokens = words.map((token) => {
             return [token, 'X']
@@ -48,7 +53,8 @@
         let x = this.transform(tokens);
         let tags = this.tagger.tag(x);
 
-        return words.map((word, index) => [word, tags[index]]);
+        let result = words.map((word, index) => [word, tags[index]]);
+        return !isFormat ? result : this.format(result).trimRight();
     }
 
     transform(tokens) {
diff --git a/test/specs/chunking.js b/test/specs/chunking.js
index 29ea896..8cca18e 100644
--- a/test/specs/chunking.js
+++ b/test/specs/chunking.js
@@ -27,4 +27,15 @@
 
     t.deepEqual(chunking.tag(''), [], 'empty string');
     t.deepEqual(tags, expected, text);
+});
+
+test('chunking format text', function (t) {
+    t.plan(1);
+
+    let text = 'Nhật ký SEA Games ngày 21/8: Ánh Viên thắng giòn giã ở vòng loại.';
+    let expected = '[NP Nhật ký] [NP SEA] [NP Games] [NP ngày] [NP 21/8] : [NP Ánh Viên] [VP thắng] [NP giòn giã] [PP ở] [NP vòng] [NP loại] .';
+
+    let result = chunking.tag(text, 'text');
+
+    t.equal(result, expected, expected);
 });
\ No newline at end of file
diff --git a/test/specs/ner.js b/test/specs/ner.js
index 693f75f..1fc9409 100644
--- a/test/specs/ner.js
+++ b/test/specs/ner.js
@@ -42,4 +42,16 @@
     t.deepEqual(newNER.tag(''), [], 'empty string');
     t.deepEqual(tags[6][3], 'B-PER', 'B-PER from new model');
     t.deepEqual(tags[7][3], 'I-PER', 'I-PER from new model');
-});
\ No newline at end of file
+});
+
+test('ner format text', function (t) {
+    t.plan(1);
+
+    let text = 'Chưa tiết lộ lịch trình tới Việt Nam của Tổng thống Mỹ Donald Trump';
+    let expected = 'Chưa tiết lộ lịch trình tới [LOC Việt Nam] của Tổng thống [LOC Mỹ] [PER Donald Trump]';
+
+    let result = ner.tag(text, 'text');
+
+    t.equal(result, expected, expected);
+});
+
diff --git a/test/specs/pos_tag.js b/test/specs/pos_tag.js
index 49e17ce..9cc1e9b 100644
--- a/test/specs/pos_tag.js
+++ b/test/specs/pos_tag.js
@@ -27,4 +27,15 @@
 
     t.deepEqual(pos.tag(''), [], 'empty string');
     t.deepEqual(tags, expected, text);
+});
+
+test('pos_tag format text', function (t) {
+    t.plan(1);
+
+    let text = 'Cán bộ xã và những chiêu "xin làm hộ nghèo" cười ra nước mắt';
+    let expected = '[N Cán bộ] [N xã] [C và] [L những] [N chiêu] [CH "] [V xin] [V làm] [N hộ] [A nghèo] [CH "] [V cười] [V ra] [N nước mắt]';
+
+    let result = pos.tag(text, 'text');
+
+    t.equal(result, expected, expected);
 });
\ No newline at end of file

From e2b8bc98be4e235bcf2e27069efd40ca111af815 Mon Sep 17 00:00:00 2001
From: Vunb
Date: Sat, 12 May 2018 01:58:07 +0700
Subject: [PATCH 6/8] format typo

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ce3cb32..d46394c 100644
--- a/README.md
+++ b/README.md
@@ -57,8 +57,8 @@ Command line: `vntk tok <file_name.txt>`
 
 ## 2. Word Segmentation
 
-> Vietnamese Word Segmentation using Conditional Random Fields, called: `Word Tokenizer`.
-> wordTokenizer helps break text into arrays of words!
+> Vietnamese Word Segmentation using Conditional Random Fields, called: `WordTokenizer`.
+> WordTokenizer helps break text into arrays of words!
```js var vntk = require('vntk'); From 9e41ae31e3c6d7c92b702a0f5532cb8752bf568f Mon Sep 17 00:00:00 2001 From: Vunb Date: Sat, 12 May 2018 01:59:15 +0700 Subject: [PATCH 7/8] cut the release version 1.3.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 43764bf..f296dbc 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "vntk", - "version": "1.2.1", + "version": "1.3.0", "description": "Vietnamese NLP Toolkit for Node", "main": "index.js", "bin": { From e9985cb16bf6baa54c305e857030cddcf554faff Mon Sep 17 00:00:00 2001 From: Vunb Date: Sat, 12 May 2018 02:24:06 +0700 Subject: [PATCH 8/8] select npm files --- package.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index f296dbc..f8599f9 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,7 @@ "vntk": "./bin/vntk.js" }, "scripts": { + "start": "node server/app.js", "test": "tape test/start.js | tap-spec" }, "repository": { @@ -39,5 +40,11 @@ "nan": "^2.10.0", "tap-spec": "^4.1.1", "tape": "^4.9.0" - } + }, + "files": [ + "bin", + "lib", + "index.js", + "README.md" + ] }
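
Taken together, the series settles on one convention: every component is obtained through a lowerCamelCase factory that takes an optional path to a custom trained model, the CRF taggers gain an optional output-format argument, and the old entry points survive only as `util.deprecate` shims. A minimal sketch of the resulting surface (assuming vntk@1.3.0 with the bundled models on disk; the custom model path below is illustrative, not part of the package):

```js
'use strict';
const vntk = require('vntk');

// lowerCamelCase factories; each accepts an optional custom model path
const tokenizer = vntk.wordTokenizer();   // replaces vntk.wordSent()
const dictionary = vntk.dictionary();     // replaces vntk.getDictionary()
const langid = vntk.langid();             // replaces vntk.Langid()

// Passing an existing file swaps in a custom trained model
// (illustrative path, assumed for this sketch):
// const ner = vntk.ner('/path/to/custom_ner_model.bin');

// The CRF taggers take a second argument; any truthy value (the README
// uses 'text') switches from token tuples to the flat string formats.
const posTag = vntk.posTag();
console.log(tokenizer.tag('Chào mừng các bạn trẻ tới thành phố Hà Nội', 'text'));
// Chào_mừng các bạn trẻ tới thành_phố Hà_Nội
console.log(posTag.tag('Cán bộ xã và những chiêu "xin làm hộ nghèo" cười ra nước mắt', 'text'));
// [N Cán bộ] [N xã] [C và] [L những] [N chiêu] [CH "] [V xin] [V làm] [N hộ] [A nghèo] [CH "] [V cười] [V ra] [N nước mắt]

console.log(dictionary.has('chào')); // true

// The deprecated names still work but emit a one-time DeprecationWarning:
// vntk.wordSent(); // `vntk.wordSent()` is deprecated, please use `vntk.wordTokenizer([custom_model])` instead.
```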