Merge pull request #40 from vunb/dev
expose API to get vntk logger
vunb authored Jun 2, 2018
2 parents 4254239 + 74066b3 commit e49d71c
Showing 3 changed files with 77 additions and 37 deletions.
68 changes: 46 additions & 22 deletions README.md
@@ -18,33 +18,57 @@ If you are interested in contributing to **vntk**, or just hacking on it, then f

Jump to guide: [How to build an NLP API Server using Vntk](#nlp-api-server).

# Documentation

* [**CLI Utilities**](#cli-utilities)
* [1. Installation](#1-installation)
* [2. Usage Example](#2-usage-example)
* [**API Usage**](#api-usage)
* [1. Tokenizer](#1-tokenizer)
* [2. Word Segmentation](#2-word-segmentation)
* [3. POS Tagging](#3-pos-tagging)
* [4. Chunking](#4-chunking)
* [5. Named Entity Recognition](#5-named-entity-recognition)
* [PER LOC ORG](#ner-per-loc-org)
* [Date time](#ner-date-time)
* [Custom NER](#ner-custom)
* [6. Utility](#6-utility)
* [Dictionary](#dictionary)
* [Clean html](#clean-html)
* [7. TF-IDF](#7-tf-idf)
* [8. Classifiers](#8-classifiers)
* [Naive Bayes](#bayes-classifier)
* [fastText](#fasttext-classifier)
* [9. Language identification](#9-language-identification)
* [10. CRFSuite](#10-crfsuite)
* [**NLP API Server**](#nlp-api-server)
* [**Contributing**](#contributing)
* [**License**](#license)

# CLI Utilities

## 1. Installation

The Vntk CLI will install nice and easy with:

> npm install -g @vntk/cli
Then you need to pay attention to how to use these CLI utilities to preprocess text from files, especially Vietnamese text, as **described at the end of each API's usage section**. If you wish to improve the tool, please fork and make it better [here](https://github.com/vntk/vntk-cli).

## 2. Usage Example

After the CLI is installed, open your `Terminal` (or Command Prompt on Windows) and type the command you want to use.

For instance, the following command opens a file and processes it with the Word Tokenizer to tokenize each line in the file.

```bash
# Process a text file or a folder
$ vntk ws input.txt --output output.txt

# The output file will contain the tokenized lines.
```
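The same pattern works for the other commands documented below; for instance, the tokenizer has a `vntk tok` command (a sketch that reuses the `--output` flag shown above):

```bash
# Tokenize every line of input.txt and write the result to tokenized.txt
$ vntk tok input.txt --output tokenized.txt
```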

# API Usage

## 1. Tokenizer

@@ -68,8 +92,8 @@ Command line: `vntk tok <file_name.txt>`
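The body of the Tokenizer example is collapsed in this diff. A rough sketch of the API usage, assuming `vntk.tokenizer()` returns an object with a `tokenize(text)` method (matching the `exports.tokenizer` factory in `lib/vntk.js` below):

```js
// Sketch only: tokenize(text) is assumed to return an array of tokens.
var vntk = require('vntk');
var tokenizer = vntk.tokenizer();

console.log(tokenizer.tokenize('Giá khuyến mãi: 140.000đ / kg'));
```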

## 2. Word Segmentation

> Vietnamese Word Segmentation using Conditional Random Fields, called: `Word Tokenizer`.
> Word Tokenizer helps break text into arrays of words!
```js
var vntk = require('vntk');
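// The rest of this example is collapsed in the diff view; a minimal sketch of the likely
// continuation, assuming the CRF word tokenizer exposes a tag(text) method:
var tokenizer = vntk.wordTokenizer();

// tag() is assumed to join the syllables of each compound word with underscores.
console.log(tokenizer.tag('Chào mừng các bạn trẻ tới thành phố Hồ Chí Minh'));
```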
@@ -256,7 +280,7 @@ vntk clean <file_name1.txt>
[Term Frequency–Inverse Document Frequency (tf-idf)](http://en.wikipedia.org/wiki/Tf%E2%80%93idf) is implemented to determine how important a word (or words) is to a document relative to a corpus. See the following example.

```js
var vntk = require('vntk');
var tfidf = new vntk.TfIdf();

tfidf.addDocument('Đại tướng Trần Đại Quang - Ủy viên Bộ Chính trị, Bí thư Đảng ủy Công an Trung ương, Bộ trưởng Bộ Công an.');
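// The rest of this example is collapsed in the diff view. A hedged sketch of how the scores
// could be read back, assuming vntk's TfIdf follows the natural.js TfIdf API, where
// tfidfs(terms, callback) iterates over every document added so far:
tfidf.tfidfs('bộ công an', function (i, measure) {
    console.log('document #' + i + ' is ' + measure);
});
```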
44 changes: 30 additions & 14 deletions lib/vntk.js
@@ -11,17 +11,20 @@ const fs = require('fs')
const util = require('util')
// singleton instance

/**
* Regex Tokenizer
*/
exports.tokenizer = () => require('./tokenizer');

/**
* Word Segmentation
* It is also a Word Tokenizer, which uses a CRF model
* @param {String} modelFileName new custom model
*/
exports.wordTokenizer = (modelFileName) => {
    if (modelFileName && fs.existsSync(modelFileName)) {
        return require('./word_tokenizer').newModel(modelFileName)
    } else {
        return require('./word_tokenizer')
    }
}
@@ -31,9 +34,9 @@ exports.wordTokenizer = (modelFileName) => {
* @param {String} modelFileName new custom model
*/
exports.posTag = (modelFileName) => {
    if (modelFileName && fs.existsSync(modelFileName)) {
        return require('./pos_tag').newModel(modelFileName)
    } else {
        return require('./pos_tag')
    }
}
@@ -43,9 +46,9 @@ exports.posTag = (modelFileName) => {
* @param {String} modelFileName new custom model
*/
exports.chunking = (modelFileName) => {
    if (modelFileName && fs.existsSync(modelFileName)) {
        return require('./chunking').newModel(modelFileName)
    } else {
        return require('./chunking')
    }
};
@@ -55,9 +58,9 @@ exports.chunking = (modelFileName) => {
* @param {String} modelFileName new custom model
*/
exports.ner = (modelFileName) => {
    if (modelFileName && fs.existsSync(modelFileName)) {
        return require('./ner').newModel(modelFileName)
    } else {
        return require('./ner')
    }
};
@@ -67,9 +70,9 @@ exports.ner = (modelFileName) => {
* @param {String} modelFileName new custom model
*/
exports.langid = (modelFileName) => {
    if (modelFileName && fs.existsSync(modelFileName)) {
        return require('./langid').newModel(modelFileName)
    } else {
        return require('./langid')
    }
};
@@ -79,9 +82,9 @@ exports.langid = (modelFileName) => {
* @param {String} modelFileName path to new updated dictionary
*/
exports.dictionary = (modelFileName) => {
    if (modelFileName && fs.existsSync(modelFileName)) {
        return new require('@vntk/dictionary').Dictionary(modelFileName)
    } else {
        return require('@vntk/dictionary')
    }
}
@@ -100,10 +103,23 @@ exports.BayesClassifier = require('./classifiers').BayesClassifier;
exports.LogisticRegressionClassifier = require('./classifiers').LogisticRegressionClassifier;
exports.FastTextClassifier = require('./classifiers').FastTextClassifier;

/**
* Utilities
*/
exports.util = () => require('./util');

/**
* Get a new logger
* @param {String} name
*/
exports.logger = (name) => {
    return require('./logger')(name);
}

/**
* Deprecated
* Please use the lower camelCase API with a custom model.
*/
exports.Langid = util.deprecate(exports.langid, '`vntk.Langid()` is deprecated, please use `vntk.langid([custom_model])` instead.')
exports.getDictionary = util.deprecate(exports.dictionary, '`vntk.getDictionary()` is deprecated, please use `vntk.dictionary([custom_model])` instead.')
exports.wordSent = util.deprecate(exports.wordTokenizer, '`vntk.wordSent()` is deprecated, please use `vntk.wordTokenizer([custom_model])` instead.')
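
The newly exposed `vntk.logger(name)` is the point of this pull request. Below is a minimal usage sketch, not taken from the repository: it assumes the object returned by `./logger` is a standard leveled logger with an `info` method, and `./models/my-model.bin` is a purely hypothetical custom CRF model path used to illustrate the optional `modelFileName` parameter.

```js
const vntk = require('vntk');

// Newly exposed logger factory; the name tags the log output.
// The info() method is an assumption about the underlying logger implementation.
const logger = vntk.logger('my-app');
logger.info('vntk logger is ready');

// CRF-backed modules accept an optional custom model path; if the file does not
// exist, the bundled default model is used (see the existsSync checks above).
const ner = vntk.ner('./models/my-model.bin'); // hypothetical path
logger.info(ner.tag('Thủ tướng Nguyễn Xuân Phúc thăm Hà Nội.'));
```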
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "vntk",
"version": "1.4.0",
"version": "1.4.1",
"description": "Vietnamese NLP Toolkit for Node",
"main": "index.js",
"bin": {
