diff --git a/README.md b/README.md
index dc66114..6eaaf78 100644
--- a/README.md
+++ b/README.md
@@ -18,33 +18,57 @@ If you are interested in contributing to **vntk**, or just hacking on it, then f
 
 Jump to guide: [How to build an NLP API Server using Vntk](#nlp-api-server).
 
+# Documentation
+
+* [**CLI Utilities**](#cli-utilities)
+  * [1. Installation](#1-installation)
+  * [2. Usage Example](#2-usage-example)
+* [**API Usage**](#api-usage)
+  * [1. Tokenizer](#1-tokenizer)
+  * [2. Word Segmentation](#2-word-segmentation)
+  * [3. POS Tagging](#3-pos-tagging)
+  * [4. Chunking](#4-chunking)
+  * [5. Named Entity Recognition](#5-named-entity-recognition)
+    * [PER LOC ORG](#ner-per-loc-org)
+    * [Date time](#ner-date-time)
+    * [Custom NER](#ner-custom)
+  * [6. Utility](#6-utility)
+    * [Dictionary](#dictionary)
+    * [Clean html](#clean-html)
+  * [7. TF-IDF](#7-tf-idf)
+  * [8. Classifiers](#8-classifiers)
+    * [Naive Bayes](#bayes-classifier)
+    * [fastText](#fasttext-classifier)
+  * [9. Language identification](#9-language-identification)
+  * [10. CRFSuite](#10-crfsuite)
+* [**NLP API Server**](#nlp-api-server)
+* [**Contributing**](#contributing)
+* [**License**](#license)
+
 # CLI Utilities
 
+## 1. Installation
+
 Vntk cli will install nice and easy with:
 
 > npm install -g @vntk/cli
 
-Then you need to pay attention how to use these cli utilities to preprocess text from files, especially vietnamese that describe at the end of each apis usage. If you wish to improve the tool, please fork and make it better [here](https://github.com/vntk/vntk-cli).
+Then pay attention to how to use these CLI utilities to preprocess text from files, especially Vietnamese text, **as described at the end of each API's usage**. If you wish to improve the tool, please fork and make it better [here](https://github.com/vntk/vntk-cli).
 
-# API Usage
+## 2. Usage Example
 
-* [1. Tokenizer](#1-tokenizer)
-* [2. Word Segmentation](#2-word-segmentation)
-* [3. POS Tagging](#3-pos-tagging)
-* [4. Chunking](#4-chunking)
-* [5. Named Entity Recognition](#5-named-entity-recognition)
-  * [PER LOC ORG](#ner-per-loc-org)
-  * [Date time](#ner-date-time)
-  * [Custom NER](#ner-custom)
-* [6. Utility](#6-utility)
-  * [Dictionary](#dictionary)
-  * [Clean html](#clean-html)
-* [7. TF-IDF](#7-tf-idf)
-* [8. Classifiers](#8-classifiers)
-  * [Naive Bayes](#bayes-classifier)
-  * [fastText](#fasttext-classifier)
-* [9. Language identification](#9-language-identification)
-* [10. CRFSuite](#10-crfsuite)
+After the CLI has been installed, open your `Terminal` (or Command Prompt on Windows) and type the command you need.
+
+For instance, the following command will open a file and process it with the Word Tokenizer, tokenizing each line in the file.
+
+```bash
+# Process a text file or a folder
+$ vntk ws input.txt --output output.txt
+
+# The output file will contain the tokenized lines.
+```
+
+# API Usage
 
 ## 1. Tokenizer
 
@@ -68,8 +92,8 @@ Command line: `vntk tok <file_name.txt>`
 
 ## 2. Word Segmentation
 
-> Vietnamese Word Segmentation using Conditional Random Fields, called: `WordTokenizer`.
-> WordTokenizer helps break text into arrays of words!
+> Vietnamese Word Segmentation using Conditional Random Fields, called: `Word Tokenizer`.
+> Word Tokenizer helps break text into arrays of words!
 
 ```js
 var vntk = require('vntk');
@@ -256,7 +280,7 @@ vntk clean
 [Term Frequency–Inverse Document Frequency (tf-idf)](http://en.wikipedia.org/wiki/Tf%E2%80%93idf) is implemented to determine how important a word (or words) is to a document relative to a corpus. See following example.
 
 ```js
-var vntk = require('./lib/vntk');
+var vntk = require('vntk');
 var tfidf = new vntk.TfIdf();
 
 tfidf.addDocument('Đại tướng Trần Đại Quang - Ủy viên Bộ Chính trị, Bí thư Đảng ủy Công an Trung ương, Bộ trưởng Bộ Công an.');
diff --git a/lib/vntk.js b/lib/vntk.js
index 692f735..ac86094 100644
--- a/lib/vntk.js
+++ b/lib/vntk.js
@@ -11,17 +11,20 @@ const fs = require('fs')
 const util = require('util')
 
 // singleton instance
-exports.util = () => require('./util');
+/**
+ * Regex Tokenizer
+ */
 exports.tokenizer = () => require('./tokenizer');
 
 /**
  * Word Segmentation
+ * It is also a Word Tokenizer which uses a CRF model
  * @param {String} modelFileName new custom model
  */
 exports.wordTokenizer = (modelFileName) => {
-    if(modelFileName && fs.existsSync(modelFileName)) {
+    if (modelFileName && fs.existsSync(modelFileName)) {
         return require('./word_tokenizer').newModel(modelFileName)
-    } else {
+    } else {
         return require('./word_tokenizer')
     }
 }
@@ -31,9 +34,9 @@ exports.wordTokenizer = (modelFileName) => {
  * @param {String} modelFileName new custom model
  */
 exports.posTag = (modelFileName) => {
-    if(modelFileName && fs.existsSync(modelFileName)) {
+    if (modelFileName && fs.existsSync(modelFileName)) {
         return require('./pos_tag').newModel(modelFileName)
-    } else {
+    } else {
         return require('./pos_tag')
     }
 }
@@ -43,9 +46,9 @@ exports.posTag = (modelFileName) => {
  * @param {String} modelFileName new custom model
  */
 exports.chunking = (modelFileName) => {
-    if(modelFileName && fs.existsSync(modelFileName)) {
+    if (modelFileName && fs.existsSync(modelFileName)) {
         return require('./chunking').newModel(modelFileName)
-    } else {
+    } else {
         return require('./chunking')
     }
 };
@@ -55,9 +58,9 @@ exports.chunking = (modelFileName) => {
  * @param {String} modelFileName new custom model
  */
 exports.ner = (modelFileName) => {
-    if(modelFileName && fs.existsSync(modelFileName)) {
+    if (modelFileName && fs.existsSync(modelFileName)) {
         return require('./ner').newModel(modelFileName)
-    } else {
+    } else {
         return require('./ner')
     }
 };
@@ -67,9 +70,9 @@ exports.ner = (modelFileName) => {
  * @param {String} modelFileName new custom model
  */
 exports.langid = (modelFileName) => {
-    if(modelFileName && fs.existsSync(modelFileName)) {
+    if (modelFileName && fs.existsSync(modelFileName)) {
         return require('./langid').newModel(modelFileName)
-    } else {
+    } else {
         return require('./langid')
     }
 };
@@ -79,9 +82,9 @@ exports.langid = (modelFileName) => {
  * @param {String} modelFileName path to new updated dictionary
  */
 exports.dictionary = (modelFileName) => {
-    if(modelFileName && fs.existsSync(modelFileName)) {
+    if (modelFileName && fs.existsSync(modelFileName)) {
         return new require('@vntk/dictionary').Dictionary(modelFileName)
-    } else {
+    } else {
         return require('@vntk/dictionary')
     }
 }
@@ -100,10 +103,23 @@ exports.BayesClassifier = require('./classifiers').BayesClassifier;
 exports.LogisticRegressionClassifier = require('./classifiers').LogisticRegressionClassifier;
 exports.FastTextClassifier = require('./classifiers').FastTextClassifier;
 
+/**
+ * Utilities
+ */
+exports.util = () => require('./util');
+
+/**
+ * Get a new logger
+ * @param {String} name
+ */
+exports.logger = (name) => {
+    return require('./logger')(name);
+}
+
 /**
  * Depreciated
  * Please use lower camelCase api with custom model.
 */
 exports.Langid = util.deprecate(exports.langid, '`vntk.Langid()` is depreciated, please use `vntk.langid([custom_model])` instead.')
 exports.getDictionary = util.deprecate(exports.dictionary, '`vntk.getDictionary()` is depreciated, please use `vntk.dictionary([custom_model])` instead.')
-exports.wordSent = util.deprecate(exports.wordTokenizer, '`vntk.wordSent()` is depreciated, please use `vntk.wordTokenizer([custom_model])` instead.')
+exports.wordSent = util.deprecate(exports.wordTokenizer, '`vntk.wordSent()` is deprecated, please use `vntk.wordTokenizer([custom_model])` instead.')
\ No newline at end of file
diff --git a/package.json b/package.json
index b196250..b5d3912 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "vntk",
-  "version": "1.4.0",
+  "version": "1.4.1",
   "description": "Vietnamese NLP Toolkit for Node",
   "main": "index.js",
   "bin": {
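
Note for reviewers: below is a minimal sketch of how the exports reorganized in `lib/vntk.js` above can be exercised, assuming the 1.4.1 layout in this diff. The `tokenize`/`tag` method names follow the README sections this change links to; the `info` call on the logger is an assumption, since `lib/logger.js` itself is not part of the diff.

```js
// Minimal sketch against the exports as laid out in lib/vntk.js above.
var vntk = require('vntk');

// Regex tokenizer (now documented) and the CRF-based word tokenizer.
var tokenizer = vntk.tokenizer();
var wordTokenizer = vntk.wordTokenizer(); // pass a file path here to load a custom CRF model

console.log(tokenizer.tokenize('Chào mừng bạn đến với Việt Nam!'));
console.log(wordTokenizer.tag('Chào mừng bạn đến với Việt Nam!'));

// `util` has moved down next to the new logger factory, but is called as before.
var util = vntk.util();

// New export in this change: a named logger.
var logger = vntk.logger('vntk-demo');
logger.info('vntk is ready'); // assumed method; lib/logger.js is not shown in this diff
```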
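
The deprecation shims kept at the bottom of `lib/vntk.js` ensure the old names still work: each wraps its replacement with Node's `util.deprecate`, so the first call logs a warning and then delegates. A sketch of the two paths:

```js
var vntk = require('vntk');

// Old name: still resolves, but prints a one-time DeprecationWarning such as
// "`vntk.wordSent()` is deprecated, please use `vntk.wordTokenizer([custom_model])` instead."
var viaOldName = vntk.wordSent();

// New name: the equivalent, warning-free call; optionally takes a custom model path.
var viaNewName = vntk.wordTokenizer();
```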