Skip to content

Commit

Permalink
close #4: we no longer rely on glibmm, so it should compile fin…
Browse files Browse the repository at this point in the history
…e on most architectures
  • Loading branch information
Allan Simon committed Apr 7, 2014
1 parent 588c4d0 commit fa634b2
Show file tree
Hide file tree
Showing 6 changed files with 250 additions and 49 deletions.
7 changes: 0 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@ cmake_minimum_required(VERSION 2.6)

project(sinoparserd)
include_directories(
/usr/lib/i386-linux-gnu/glib-2.0/include/
/usr/include/glibmm-2.4
/usr/lib/glib-2.0/include/
/usr/lib/glibmm-2.4/include
/usr/include/glib-2.0
/usr/lib/i386-linux-gnu/glibmm-2.4/include/
src
)

Expand All @@ -27,5 +21,4 @@ target_link_libraries(
stdc++
expat
event
glibmm-2.4
)
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ The data files are provided only as example
### Requirement ###

* libexpat
* libgmm++
* libevent

### Usage ###
Expand Down
93 changes: 53 additions & 40 deletions src/Parser.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
#ifndef SINOPARSER_PARSER_H
#define SINOPARSER_PARSER_H

#include <glibmm/ustring.h>
#include <vector>
#include <map>
#include <algorithm>
#include "Index.h"
#include "Database.h"
#include "Utf8String.h"

#define TRADITIONAL_SCRIPT true
#define SIMPLIFIED_SCRIPT false
Expand All @@ -14,16 +15,16 @@
template <typename T>
class Parser{
private:
Glib::ustring text_to_parse;
std::vector<Glib::ustring> segments;
Utf8String text_to_parse;
std::vector<std::string> segments;
std::vector<T*> items;
std::map<Glib::ustring, Glib::ustring> convertChinese2Latin;
std::map<Glib::ustring, Glib::ustring> convertLatin2Chinese;
std::map<std::string, std::string> convertChinese2Latin;
std::map<std::string, std::string> convertLatin2Chinese;

inline void init_maps();

Glib::ustring convert_trash_segment(Glib::ustring segment, bool toLatin);
Glib::ustring convert_trash_char(Glib::ustring trashChar, bool toLatin);
std::string convert_trash_segment(std::string segment, bool toLatin);
std::string convert_trash_char(std::string trashChar, bool toLatin);


inline std::string romanize_segment(int segmentNbr);
Expand All @@ -32,7 +33,7 @@ class Parser{

public:
Parser();
Parser(Glib::ustring text);
Parser(Utf8String text);
Parser(char* text);

void change_text(char* text);
Expand All @@ -42,7 +43,7 @@ class Parser{
std::string trad();
std::string simp();
std::string change_script();
std::vector<Glib::ustring> get_segments();
std::vector<std::string> get_segments();

bool guess_script();
void parse_against_index(Index<T>& index);
Expand Down Expand Up @@ -72,7 +73,7 @@ Parser<T>::Parser() {
*
*/
template <typename T>
Parser<T>::Parser(Glib::ustring text) {
Parser<T>::Parser(Utf8String text) {
text_to_parse = text;
init_maps();
}
Expand All @@ -82,7 +83,7 @@ Parser<T>::Parser(Glib::ustring text) {
*/
template <typename T>
Parser<T>::Parser(char* text) {
text_to_parse = Glib::ustring(text);
text_to_parse = Utf8String(text);
init_maps();

}
Expand All @@ -92,24 +93,24 @@ Parser<T>::Parser(char* text) {
*/
template <typename T>
inline void Parser<T>::init_maps() {
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("","."));
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("",","));
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("","?"));
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("",","));
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("","!"));
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("",";"));
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("",":"));
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("","'"));
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("","\""));
convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("","\""));

convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>(".",""));
convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>("?",""));
convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>(",",""));
convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>("!",""));
convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>(";",""));
convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>(":",""));
convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>("\"",""));
convertChinese2Latin.insert(std::pair<std::string, std::string>("","."));
convertChinese2Latin.insert(std::pair<std::string, std::string>("",","));
convertChinese2Latin.insert(std::pair<std::string, std::string>("","?"));
convertChinese2Latin.insert(std::pair<std::string, std::string>("",","));
convertChinese2Latin.insert(std::pair<std::string, std::string>("","!"));
convertChinese2Latin.insert(std::pair<std::string, std::string>("",";"));
convertChinese2Latin.insert(std::pair<std::string, std::string>("",":"));
convertChinese2Latin.insert(std::pair<std::string, std::string>("","'"));
convertChinese2Latin.insert(std::pair<std::string, std::string>("","\""));
convertChinese2Latin.insert(std::pair<std::string, std::string>("","\""));

convertLatin2Chinese.insert(std::pair<std::string, std::string>(".",""));
convertLatin2Chinese.insert(std::pair<std::string, std::string>("?",""));
convertLatin2Chinese.insert(std::pair<std::string, std::string>(",",""));
convertLatin2Chinese.insert(std::pair<std::string, std::string>("!",""));
convertLatin2Chinese.insert(std::pair<std::string, std::string>(";",""));
convertLatin2Chinese.insert(std::pair<std::string, std::string>(":",""));
convertLatin2Chinese.insert(std::pair<std::string, std::string>("\"",""));


}
Expand All @@ -120,7 +121,7 @@ inline void Parser<T>::init_maps() {

template <typename T>
void Parser<T>::change_text(char* text) {
text_to_parse = Glib::ustring(text);
text_to_parse = Utf8String(text);
segments.clear();
items.clear();
}
Expand Down Expand Up @@ -178,17 +179,27 @@ void Parser<T>::parse_against_index(Index<T> & index) {
// borderline :p)

while (startPosition < text_to_parse.size()) {
Glib::ustring longestMatchBlock = "";
std::string longestMatchBlock = "";
T* longestMatchItem = NULL;
// we track the match length separately because longestMatchBlock.size()
// cannot give us the number of UTF-8 characters: it simply returns
// a number of bytes, and in UTF-8:
// number of bytes != number of characters
size_t sizeLongestMatchBlock = 0;

std::string tempBlock = text_to_parse.substr(startPosition,1);
std::string tempBlock = text_to_parse.substr(startPosition, 1);

T* tempItem = index.get_item(tempBlock);

if (tempItem != NULL) {
// normal mode try to find the longest word starting
// at the end of the previous one
for (int i = 1; i <= MIN(text_to_parse.size()-startPosition, WINDOW) ; i++) {
size_t maxSubstringSize = std::min(
text_to_parse.size() - startPosition,
static_cast<size_t>(WINDOW)
);

for (int i = 1; i <= maxSubstringSize; i++) {

tempBlock = text_to_parse.substr(startPosition,i);

Expand All @@ -198,6 +209,7 @@ void Parser<T>::parse_against_index(Index<T> & index) {
// so we consider it as the current longest one
if (tempItem != NULL) {
longestMatchBlock = tempBlock;
sizeLongestMatchBlock = i;
longestMatchItem = tempItem;
}
}
Expand All @@ -217,6 +229,7 @@ void Parser<T>::parse_against_index(Index<T> & index) {
break;
}

sizeLongestMatchBlock++;
longestMatchBlock += tempBlock;
longestMatchItem = tempItem;
}
Expand All @@ -225,7 +238,7 @@ void Parser<T>::parse_against_index(Index<T> & index) {
segments.push_back(longestMatchBlock);
items.push_back(longestMatchItem);

startPosition += longestMatchBlock.size();
startPosition += sizeLongestMatchBlock;
}

}
Expand Down Expand Up @@ -428,8 +441,8 @@ std::string Parser<T>::change_script() {
*/

template <typename T>
Glib::ustring Parser<T>::convert_trash_segment(Glib::ustring segment, bool toLatin) {
Glib::ustring temp("");
std::string Parser<T>::convert_trash_segment(std::string segment, bool toLatin) {
std::string temp("");
for (int i = 0; i < segment.size() ; i++) {
temp += convert_trash_char(segment.substr(i,1), toLatin);
}
Expand All @@ -442,10 +455,10 @@ Glib::ustring Parser<T>::convert_trash_segment(Glib::ustring segment, bool toLat
*/

template <typename T>
Glib::ustring Parser<T>::convert_trash_char(Glib::ustring trashChar, bool toLatin) {
std::string Parser<T>::convert_trash_char(std::string trashChar, bool toLatin) {

std::map<Glib::ustring, Glib::ustring>::iterator iter;
std::map<Glib::ustring, Glib::ustring> convertMap ;
std::map<std::string, std::string>::iterator iter;
std::map<std::string, std::string> convertMap ;

if (toLatin) {
convertMap = convertChinese2Latin;
Expand All @@ -468,7 +481,7 @@ Glib::ustring Parser<T>::convert_trash_char(Glib::ustring trashChar, bool toLati
*/

template <typename T>
std::vector<Glib::ustring> Parser<T>::get_segments() {
std::vector<std::string> Parser<T>::get_segments() {
return segments;
}

Expand Down
141 changes: 141 additions & 0 deletions src/Utf8String.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#include <iostream>
#include "Utf8String.h"


/**
 * Tells how many bytes a UTF-8 character occupies, judging only by its
 * first (lead) byte.
 *
 * @param firstByte  the first byte of the character
 * @return 1, 2, 3 or 4 depending on the lead byte's marker bits, or -1 when
 *         the byte matches none of the four lead-byte patterns
 */
int character_octet_size (const unsigned char firstByte) {

    // One row per recognised lead-byte pattern:
    // { mask selecting the marker bits, expected marker value, byte count }
    static const unsigned char leadPatterns[4][3] = {
        { 0x80, 0x00, 1 },  // 0xxx xxxx : single-byte (ASCII) character
        { 0xE0, 0xC0, 2 },  // 110x xxxx
        { 0xF0, 0xE0, 3 },  // 1110 xxxx
        { 0xF8, 0xF0, 4 }   // 1111 0xxx
    };

    for (int row = 0; row < 4; ++row) {
        const unsigned char markerBits = firstByte & leadPatterns[row][0];
        if (markerBits == leadPatterns[row][1]) {
            return leadPatterns[row][2];
        }
    }

    // none of the patterns matched: not a recognised lead byte
    return -1;
}

/**
 * Splits a byte string into a list of UTF-8 characters.
 *
 * Each entry of the returned list holds the bytes of one character, with the
 * length of each character taken from character_octet_size() applied to its
 * first byte.
 *
 * A byte that character_octet_size() does not recognise as a lead byte
 * (return value -1) is emitted as a one-byte entry of its own. If the input
 * ends in the middle of a multi-byte character, the bytes collected so far
 * are emitted as a final entry, so that concatenating the entries always
 * gives back the input.
 *
 * @param stringToSegment  the raw bytes to split
 * @return the characters, in input order
 */
Segments create_new_utf8_string (std::string stringToSegment) {

    std::string utf8Character;
    // 4 because that's the maximum size in bytes of a UTF-8 character
    utf8Character.reserve(4);
    Segments segmentedLine;

    // number of bytes still expected for the character being built
    int characterSize = 0;
    for (size_t i = 0; i < stringToSegment.size(); i++) {
        const unsigned char byte = stringToSegment[i];
        // if we have finished the previous UTF-8 character,
        // this byte starts a new one: take its size
        if (characterSize <= 0) {
            characterSize = character_octet_size(byte);
        }

        // push the current byte onto the character we're building
        utf8Character.push_back(byte);
        characterSize--;

        // once all bytes of the current character are consumed (or the
        // lead byte was unrecognised, which makes the counter negative),
        // the character is ready to be added to the list
        if (characterSize <= 0) {
            segmentedLine.push_back(utf8Character);
            utf8Character.clear();
        }
    }

    // the input stopped in the middle of a multi-byte character:
    // keep the partial bytes rather than silently dropping them
    if (!utf8Character.empty()) {
        segmentedLine.push_back(utf8Character);
    }

    return segmentedLine;
}


/**
 * Default constructor: performs no work of its own, so `segments` is left
 * default-constructed.
 */
Utf8String::Utf8String() {
}

/**
 * Builds the string from raw bytes, splitting them into UTF-8 characters
 * with create_new_utf8_string().
 *
 * @param stringToSegment  the bytes to split into characters
 */
Utf8String::Utf8String(std::string stringToSegment) {
    this->segments = create_new_utf8_string(stringToSegment);
}

/**
 * Builds the string from a NUL-terminated array of char, splitting it into
 * UTF-8 characters with create_new_utf8_string().
 *
 * @param charToSegment  NUL-terminated bytes to split; a null pointer is
 *                       treated as an empty string
 */
Utf8String::Utf8String(char* charToSegment) {
    // constructing a std::string from a null pointer is undefined behaviour,
    // so a null input simply leaves `segments` default-constructed
    if (!charToSegment) {
        return;
    }

    // it should be possible to be a little more efficient by not using an
    // intermediate string, and instead iterating over the array of char
    // until we meet a \0
    std::string stringToSegment(charToSegment);
    segments = create_new_utf8_string(stringToSegment);
}

/**
 * Builds the string from a NUL-terminated array of const char, splitting it
 * into UTF-8 characters with create_new_utf8_string().
 *
 * @param charToSegment  NUL-terminated bytes to split; a null pointer is
 *                       treated as an empty string
 */
Utf8String::Utf8String(const char* charToSegment) {
    // constructing a std::string from a null pointer is undefined behaviour,
    // so a null input simply leaves `segments` default-constructed
    if (!charToSegment) {
        return;
    }

    // it should be possible to be a little more efficient by not using an
    // intermediate string, and instead iterating over the array of char
    // until we meet a \0
    std::string stringToSegment(charToSegment);
    segments = create_new_utf8_string(stringToSegment);
}

/**
 * Concatenates every stored character, in order, into one std::string.
 *
 * @return the concatenation of all entries of `segments`
 */
std::string Utf8String::to_string() const {
    std::string joined;

    const size_t characterCount = segments.size();
    for (size_t index = 0; index != characterCount; ++index) {
        joined += segments[index];
    }

    return joined;
}

/**
 * Number of entries held in `segments`, i.e. a count of characters rather
 * than of bytes.
 *
 * @return the number of entries in `segments`
 */
size_t Utf8String::size() const {
    const size_t characterCount = segments.size();
    return characterCount;
}


/**
 * Extracts a run of characters and returns them concatenated.
 *
 * Positions and lengths are expressed in characters (entries of `segments`),
 * not in bytes. The requested range is clamped to what is actually stored,
 * so an out-of-range request never reads past the end of `segments`.
 *
 * @param start  index of the first character to extract
 * @param size   maximum number of characters to extract
 * @return the extracted characters; empty when `start` is at or past the end
 */
std::string Utf8String::substr(size_t start, size_t size) const {
    std::string toOutput;

    const size_t total = segments.size();
    if (start >= total) {
        return toOutput;
    }

    // never read more characters than remain after `start`
    const size_t available = total - start;
    const size_t count = (size < available) ? size : available;

    for (size_t i = 0; i < count; i++) {
        toOutput += (segments[start + i]);
    }
    return toOutput;
}


/**
 * Writes the string, as produced by to_string(), to an output stream.
 *
 * @param stream      the stream to write to
 * @param utf8String  the string to write
 * @return `stream`, so that insertions can be chained
 */
std::ostream& operator<< (std::ostream& stream, const Utf8String& utf8String) {
    stream << utf8String.to_string();
    // flowing off the end of a non-void function is undefined behaviour,
    // and callers need the stream back to chain further insertions
    return stream;
}
Loading

0 comments on commit fa634b2

Please sign in to comment.