-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add fastText classifier and language detection
- Loading branch information
Showing
10 changed files
with
148 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ language: node_js | |
|
||
os: | ||
- linux | ||
- osx | ||
#- osx | ||
|
||
node_js: | ||
- '4' | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
// Re-exports the `Classifier` constructor of the `fasttext` package so that
// other modules can require it from the local classifiers directory.
'use strict'; | ||
// NOTE(review): `path` is not referenced in the lines visible here — confirm whether this require is still needed.
const path = require('path'); | ||
const fastText = require('fasttext'); | ||
|
||
module.exports = fastText.Classifier; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
// Classifier implementations exposed by this module, keyed by their public names.
// Each entry is loaded from the sibling file named in the require() call.
exports.BayesClassifier = require('./bayes'); | ||
exports.LogisticRegressionClassifier = require('./logistic_regression'); | ||
exports.LogisticRegressionClassifier = require('./logistic_regression'); | ||
exports.FastTextClassifier = require('./fasttext'); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
'use strict'; | ||
const path = require('path'); | ||
const FastTextClassifier = require('../classifiers/fasttext'); | ||
const logger = require('../logger')('Langid'); | ||
|
||
/**
 * Language identifier backed by a fastText classifier that is loaded from the
 * `lid.176.ftz` model file located next to this module.
 */
class Langid {

    /**
     * Resolves the model path relative to this file, builds the classifier
     * from it and logs the model location.
     */
    constructor() {
        this.model_filename = path.resolve(__dirname, './lid.176.ftz');
        this.classifier = new FastTextClassifier(this.model_filename);
        logger.info(`load model ${this.model_filename} success!`);
    }

    /**
     * Language codes listed for this identifier.
     *
     * @returns {string[]} A new array on every access, so callers may modify
     *     the returned list without affecting later reads.
     */
    get langids() {
        const _langids = 'af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh';
        return _langids.split(' ');
    }

    /**
     * Detects the single most likely language of `input`.
     *
     * @param {string} input Text to classify.
     * @returns {Promise<?string>} The top label with its `__label__` prefix
     *     removed, or `null` when the classifier returns no prediction.
     *     Rejects with the classifier's error when prediction fails.
     */
    detect(input) {
        return this.getLanguages(input, 1).then((res) => {
            if (res && res.length > 0) {
                return res[0].label;
            }
            return null;
        });
    }

    /**
     * Asks the classifier for up to `num` predictions for `input`.
     *
     * Each prediction is returned as a copy of the classifier's result with
     * the `__label__` prefix stripped from `label` and `confidence` set to
     * the result's `value`.
     *
     * @param {string} input Text to classify.
     * @param {number} [num] Number of predictions to request; falsy values
     *     fall back to 1.
     * @param {function(*, Array=)} [callback] Optional Node-style callback
     *     invoked with `(err, predictions)`; on failure only `err` is passed.
     * @returns {Promise<Array>} Resolves with the predictions, or rejects with
     *     the error reported by the classifier.
     */
    getLanguages(input, num, callback) {
        const limit = num || 1;
        const done = typeof callback === 'function' ? callback : () => {};

        return new Promise((resolve, reject) => {
            this.classifier.predict(input, limit, (err, res) => {
                // Check the error before touching `res`: on failure the
                // result may be undefined and must not be mapped.
                if (err) {
                    reject(err);
                    done(err);
                    return;
                }

                const lids = (res || []).map((lid) => Object.assign({}, lid, {
                    label: lid.label.replace(/^__label__/, ''),
                    confidence: lid.value,
                }));

                // Settle the promise first so that a throwing user callback
                // cannot leave it pending.
                resolve(lids);
                done(err, lids);
            });
        });
    }

}
|
||
module.exports = new Langid(); |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
'use strict'; | ||
const test = require('tape'); | ||
const path = require('path'); | ||
const langid = require('../../lib/vntk').Langid; | ||
|
||
// End-to-end check of the exported Langid instance: one assertion from
// detect() and two from getLanguages().
test('vntk language identification', function (t) {
    t.plan(3);

    langid.detect('bạn ở đây trong bao lâu?')
        .then((lid) => {
            t.equal(lid, 'vi', 'Vietnamese');
        })
        // Report a rejected prediction as a test failure instead of leaving
        // an unhandled rejection.
        .catch((err) => t.error(err));

    langid.getLanguages('Wie lange bleiben Sie?', 5)
        .then((res) => {
            const lid = res[0].label;
            t.equal(lid, 'de', 'German');
            t.equal(res.length, 5, 'number of languagues are detected');
        })
        .catch((err) => t.error(err));
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters