-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkama.cpp
109 lines (92 loc) · 2.81 KB
/
kama.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include "kama.h"
#include <iostream>
#include <string.h>
#include <stdlib.h>
namespace Kama{
/**
 * Convert one MeCab node into a Kama::Node.
 *
 * The node's surface text (node->length bytes starting at node->surface) is
 * copied into a freshly malloc'ed, NUL-terminated buffer that is handed to
 * the caller through Node::surface; it must be released with free().
 * Node::feature is NOT copied: it aliases node->feature.
 *
 * @param node  MeCab node to convert.
 * @param nbest Index of the n-best path this node belongs to; stored as-is.
 * @return A node whose surface is NULL when there is nothing to report
 *         (null or zero-length node, allocation failure, or the surface is a
 *         stopword). In that case the remaining fields are left unset and
 *         the caller is expected to discard the node.
 */
Node Tagger::parseMeCabNode(const MeCab::Node* node, unsigned int nbest){
    Node tNode;
    tNode.surface = NULL;
    if(node == NULL || node->length == 0){
        return tNode;
    }

    char* surface = (char*)malloc(sizeof(char)*(node->length+1));
    if(surface == NULL){
        // Out of memory: report "no node" instead of writing through NULL.
        return tNode;
    }
    strncpy(surface, node->surface, node->length);
    surface[node->length] = '\0';

    if(isStopword(surface)){
        // Only a local pointer is freed here, so tNode.surface stays NULL and
        // the caller's NULL check skips the stopword instead of receiving a
        // dangling pointer.
        free(surface);
        return tNode;
    }

    tNode.surface = surface;
    tNode.feature = (char*)(node->feature);
    tNode.nbest = nbest;
    tNode.id = node->id;
    return tNode;
}
std::vector<Kama::Node>* Tagger::parse(const char* str){
return parse(str, 1);
}
/**
 * Analyse `str` with MeCab and collect the nodes of up to `nbest` paths.
 *
 * @param str   Sentence to analyse.
 * @param nbest Maximum number of n-best paths to walk; 0 yields an empty
 *              result.
 * @return NULL when `str` is NULL. Otherwise a vector allocated with `new`
 *         that the caller must `delete`. Nodes for which parseMeCabNode
 *         reports a NULL surface are left out. Each stored node carries the
 *         1-based index of the path it came from. The vector is empty when
 *         the MeCab parse itself fails.
 */
std::vector<Kama::Node>* Tagger::parse(const char* str, unsigned int nbest){
    if(str == NULL){
        return NULL;
    }
    std::vector<Kama::Node> *nodeVector = new std::vector<Kama::Node>();

    this->mcLattice->set_request_type(MECAB_NBEST);
    this->mcLattice->set_sentence(str);
    if(!this->mcTagger->parse(this->mcLattice)){
        // The lattice holds no valid result; do not iterate it.
        return nodeVector;
    }

    unsigned int tNbest = 1;
    // next() advances the lattice to the following n-best path and returns
    // false once there are no more paths.
    while(tNbest <= nbest && this->mcLattice->next()){
        MeCab::Node* node = this->mcLattice->bos_node();
        // NOTE(review): CHECK_ERROR is defined outside this file. If it
        // returns early on a null node, nodeVector is leaked -- confirm.
        CHECK_ERROR(node);
        for(; node; node = node->next){
            if(node->stat == MECAB_EOS_NODE){
                // End of this path: following nodes belong to the next one.
                tNbest++;
                continue;
            }
            Kama::Node tNode = parseMeCabNode(node, tNbest);
            if(tNode.surface == NULL){
                continue;
            }
            nodeVector->push_back(tNode);
        }
    }
    return nodeVector;
}
// Report whether `word` is present in the tagger's stopword set.
// `word` is converted to std::string for the lookup (exact match).
bool Tagger::isStopword(const char* word){
    const std::string key(word);
    return this->stopwordList.count(key) != 0;
}
// Dump every stopword to stdout, one per line, as "prn(<length>) : <word>".
void Tagger::printStopword(){
    typedef std::set<std::string>::const_iterator StopwordIter;
    StopwordIter it = this->stopwordList.begin();
    const StopwordIter last = this->stopwordList.end();
    while(it != last){
        std::cout << "prn" << "(" << it->length() << ") : " << *it << std::endl;
        ++it;
    }
}
// Replace the current stopword set with the words in `stopwordList`.
// The existing set is emptied first; each C string is copied into a
// std::string before insertion.
void Tagger::setStopwordList(std::list<const char*> stopwordList){
    clearStopwordList();
    typedef std::list<const char*>::const_iterator WordIter;
    WordIter it = stopwordList.begin();
    while(it != stopwordList.end()){
        const std::string entry(*it);
        this->stopwordList.insert(entry);
        ++it;
    }
}
// Remove every entry from the stopword set.
// A single clear() is sufficient. Erasing elements one by one while
// advancing the same iterator would increment an iterator that erase() has
// just invalidated, which is undefined behaviour.
void Tagger::clearStopwordList(){
    this->stopwordList.clear();
}
// Add a single word to the stopword set; the C string is copied into a
// std::string before insertion.
void Tagger::addStopword(const char* stopword){
    const std::string entry(stopword);
    this->stopwordList.insert(entry);
}
}
// Command-line driver: analyses the text given as the first argument and
// prints one line per node as "[<nbest>] : <id> <surface> <feature>".
// Returns 1 when no text argument is supplied, 0 otherwise.
int main(int argc, char** argv)
{
    if(argc < 2){
        // Without this check argv[1] is NULL, parse() returns NULL, and the
        // loop below would dereference it.
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "kama") << " <text>" << std::endl;
        return 1;
    }

    Kama::Tagger* tagger = new Kama::Tagger("/usr/local/lib/mecab/dic/mecab-ko-dic");
    std::vector<Kama::Node> *nodeVector = tagger->parse(argv[1]);
    if(nodeVector != NULL){
        for(std::vector<Kama::Node>::const_iterator i = nodeVector->begin(); i != nodeVector->end(); ++i){
            std::cout << "[" << i->nbest << "] : " << i->id << ' ' << i->surface << ' ' << i->feature << std::endl;
        }
        // NOTE(review): each node's surface buffer is malloc'ed by the tagger
        // and is not released here, because Kama::Node's cleanup rules are
        // declared outside this file; the process exits right after.
        delete nodeVector;
    }
    delete tagger;
    return 0;
}