Skip to content

Commit

Permalink
Merge pull request #27 from vunb/v1
Browse files Browse the repository at this point in the history
add fastText classifier and language detection
  • Loading branch information
vunb authored Dec 11, 2017
2 parents 86db621 + 77182d3 commit 737a451
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ language: node_js

os:
- linux
- osx
#- osx

node_js:
- '4'
Expand Down
65 changes: 64 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ If you are interested in contributing to **vntk**, or just hacking on it, then f
* [6. Utility](#6-utility)
* [7. TF-IDF](#7-tf-idf)
* [8. Classifiers](#8-classifiers)
* [9. Language identification](#9-language-identification)

## 1. Tokenizer

Expand Down Expand Up @@ -199,7 +200,7 @@ document #3 is 9.242592351485516

## 8. Classifiers

[Naive Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier) is a classifier currently supported. [fastText](https://github.com/facebookresearch/fastText), will be added in the next release.
[Naive Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier) and [fastText](https://github.com/facebookresearch/fastText) are the classifiers currently supported.

The following examples use the **BayesClassifier** class:

Expand Down Expand Up @@ -229,6 +230,68 @@ console.log(classifier.classify('kẻ thù của luffy là ai?'));
// output: who
```

### FastText Classifier

Following the [fasttext.cc supervised tutorial](https://fasttext.cc/docs/en/supervised-tutorial.html), here is a simple classifier that runs a prediction model trained on `cooking` questions from Stack Exchange:

```js
const path = require('path');
const vntk = require('vntk');

const model = path.resolve(__dirname, './model_cooking.bin');
const classifier = new vntk.FastTextClassifier(model);

classifier.predict('Why not put knives in the dishwasher?', 5, (err, res) => {
if (err) {
console.error(err);
} else if (res.length > 0) {
let tag = res[0].label; // __label__knives
let confidence = res[0].value; // 0.8787146210670471
console.log('classify', tag, confidence, res);
} else {
console.log('No matches');
}
});
```

## 9. Language identification

**VNTK Langid** can identify 176 languages from text samples and return confidence scores for each (see the list of ISO codes below). This model was trained by [fastText](https://fasttext.cc/docs/en/language-identification.html) on data from Wikipedia, Tatoeba and SETimes, used under CC-BY-SA.

API usage:

* langid.detect([input])
* langid.getLanguages([input, num, callback])
* langid.langids - list of supported languages

```js
const langid = require('vntk').Langid;

// returns the most likely language detected
langid.detect('sử dụng vntk với fastext rất tuyệt?')
.then((lid) => {
console.log(lid)
// vi
});

// returns the `num` most likely languages, each with a confidence score
langid.getLanguages('Wie lange bleiben Sie?', 5)
.then((res) => {
let lid = res[0].label;
console.log(lid); // de (German)
console.log(res.length); // 5 (number of languages returned)
console.log(res)
});

// returns the list of supported languages
console.log(langid.langids)
```


List of supported languages

> af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
# Contributing

Pull requests and stars are highly welcome.
Expand Down
5 changes: 5 additions & 0 deletions lib/classifiers/fasttext.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
'use strict';
const path = require('path');
const fastText = require('fasttext');

// Re-export the Classifier class from the `fasttext` package unchanged.
module.exports = fastText.Classifier;
3 changes: 2 additions & 1 deletion lib/classifiers/index.js
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
exports.BayesClassifier = require('./bayes');
exports.LogisticRegressionClassifier = require('./logistic_regression');
exports.LogisticRegressionClassifier = require('./logistic_regression');
exports.FastTextClassifier = require('./fasttext');
50 changes: 50 additions & 0 deletions lib/langid/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
'use strict';
const path = require('path');
const FastTextClassifier = require('../classifiers/fasttext');
const logger = require('../logger')('Langid');

/**
 * Language identification on top of a fastText classifier.
 *
 * The classifier is created from the `lid.176.ftz` model file that sits next
 * to this module. Labels reported by the classifier carry a `__label__`
 * prefix, which is stripped before results are handed to callers.
 */
class Langid {

  /**
   * Resolves the bundled model path and creates the underlying classifier.
   */
  constructor() {
    this.model_filename = path.resolve(__dirname, './lid.176.ftz');
    this.classifier = new FastTextClassifier(this.model_filename);
    logger.info(`load model ${this.model_filename} success!`);
  }

  /**
   * List of supported language codes.
   *
   * @returns {string[]} a fresh array on every access
   */
  get langids() {
    const codes = 'af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh';
    return codes.split(' ');
  }

  /**
   * Detects the single best-matching language for `input`.
   *
   * @param {string} input - text sample to identify
   * @returns {Promise<?string>} the language code of the top prediction, or
   *   `null` when the classifier returns no prediction
   */
  detect(input) {
    return this.getLanguages(input, 1).then((res) => {
      if (res && res.length > 0) {
        return res[0].label;
      }
      return null;
    });
  }

  /**
   * Asks the classifier for up to `num` predictions for `input`.
   *
   * Each prediction object returned by the classifier is updated in place:
   * the `__label__` prefix is removed from `label`, and `confidence` is added
   * as an alias of `value`.
   *
   * @param {string} input - text sample to identify
   * @param {number} [num=1] - number of predictions to request (falsy values
   *   fall back to 1)
   * @param {function(?Error, Array=)} [callback] - optional node-style
   *   callback, invoked before the promise settles
   * @returns {Promise<Array>} resolves with the predictions; rejects with the
   *   classifier's error
   */
  getLanguages(input, num, callback) {
    return new Promise((resolve, reject) => {
      num = num || 1;
      callback = callback || (() => 1);
      this.classifier.predict(input, num, (err, res) => {
        // Handle the error before touching `res`: on failure `res` may be
        // undefined, and mapping over it would throw inside this callback and
        // hide the real error from both the callback and the promise.
        if (err) {
          callback(err);
          reject(err);
          return;
        }

        const lids = (res || []).map((lid) => {
          lid.label = lid.label.replace(/^__label__/, '');
          lid.confidence = lid.value;
          return lid;
        });

        callback(err, lids);
        resolve(lids);
      });
    });
  }

}

// Export a single Langid instance, constructed once when this module is first required.
module.exports = new Langid();
Binary file added lib/langid/lid.176.ftz
Binary file not shown.
4 changes: 3 additions & 1 deletion lib/vntk.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@

exports.TfIdf = require('./tfidf');
exports.BayesClassifier = require('./classifiers').BayesClassifier;
exports.LogisticRegressionClassifier = require('./classifiers').LogisticRegressionClassifier;
exports.LogisticRegressionClassifier = require('./classifiers').LogisticRegressionClassifier;
exports.FastTextClassifier = require('./classifiers').FastTextClassifier;
exports.Langid = require('./langid');
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "vntk",
"version": "1.1.0",
"version": "1.2.0",
"description": "Vietnamese NLP Toolkit for Node",
"main": "index.js",
"bin": {
Expand Down Expand Up @@ -31,6 +31,7 @@
"commander": "2.9.0",
"crfsuite": "^0.9.3",
"debug": "^3.1.0",
"fasttext": "^0.1.0",
"lodash": "4.15.0",
"title-case": "^2.1.1"
},
Expand Down
20 changes: 20 additions & 0 deletions test/specs/langid.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
'use strict';
const test = require('tape');
const path = require('path');
const langid = require('../../lib/vntk').Langid;

// Exercises Langid through the public vntk entry point: one assertion for
// single-language detection and two for the ranked candidate list.
test('vntk language identification', function (t) {
  t.plan(3);

  langid.detect('bạn ở đây trong bao lâu?')
    .then((lid) => {
      t.equal(lid, 'vi', 'Vietnamese');
    })
    // Report a rejection as a failing assertion instead of leaving it
    // unhandled, where it would only show up as a plan-count mismatch.
    .catch((err) => t.error(err, 'detect should not reject'));

  langid.getLanguages('Wie lange bleiben Sie?', 5)
    .then((res) => {
      let lid = res[0].label;
      t.equal(lid, 'de', 'German');
      t.equal(res.length, 5, 'number of languagues are detected');
      console.log(res);
    })
    .catch((err) => t.error(err, 'getLanguages should not reject'));
});
1 change: 1 addition & 0 deletions test/start.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ var dir = '../test/specs/';
'ner',
'tfidf',
'bayes_classifier',
'langid',
].forEach((script) => {
require(path.join(dir, script));
});

0 comments on commit 737a451

Please sign in to comment.