From fa634b22d649e6d9b65fec024ae14d5dfe50e3c5 Mon Sep 17 00:00:00 2001 From: Allan Simon Date: Mon, 7 Apr 2014 13:45:48 +0800 Subject: [PATCH] close #4, now we don't rely anymore on ligmm++, so should compile fine on most architecture --- CMakeLists.txt | 7 --- README.md | 1 - src/Parser.h | 93 +++++++++++++++++------------- src/Utf8String.cpp | 141 +++++++++++++++++++++++++++++++++++++++++++++ src/Utf8String.h | 56 ++++++++++++++++++ src/main.cpp | 1 - 6 files changed, 250 insertions(+), 49 deletions(-) create mode 100644 src/Utf8String.cpp create mode 100644 src/Utf8String.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b78367..66a8f4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,12 +2,6 @@ cmake_minimum_required(VERSION 2.6) project(sinoparserd) include_directories( - /usr/lib/i386-linux-gnu/glib-2.0/include/ - /usr/include/glibmm-2.4 - /usr/lib/glib-2.0/include/ - /usr/lib/glibmm-2.4/include - /usr/include/glib-2.0 - /usr/lib/i386-linux-gnu/glibmm-2.4/include/ src ) @@ -27,5 +21,4 @@ target_link_libraries( stdc++ expat event - glibmm-2.4 ) diff --git a/README.md b/README.md index 5f32998..d021d72 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ The data files are provided only as example ### Requirement ### * libexpat - * libgmm++ * libevent ### Usage ### diff --git a/src/Parser.h b/src/Parser.h index cbe9b6d..38c862a 100644 --- a/src/Parser.h +++ b/src/Parser.h @@ -1,11 +1,12 @@ #ifndef SINOPARSER_PARSER_H #define SINOPARSER_PARSER_H -#include #include #include +#include #include "Index.h" #include "Database.h" +#include "Utf8String.h" #define TRADITIONAL_SCRIPT true #define SIMPLIFIED_SCRIPT false @@ -14,16 +15,16 @@ template class Parser{ private: - Glib::ustring text_to_parse; - std::vector segments; + Utf8String text_to_parse; + std::vector segments; std::vector items; - std::map convertChinese2Latin; - std::map convertLatin2Chinese; + std::map convertChinese2Latin; + std::map convertLatin2Chinese; inline void init_maps(); - Glib::ustring convert_trash_segment(Glib::ustring segment, bool toLatin); - Glib::ustring convert_trash_char(Glib::ustring trashChar, bool toLatin); + std::string convert_trash_segment(std::string segment, bool toLatin); + std::string convert_trash_char(std::string trashChar, bool toLatin); inline std::string romanize_segment(int segmentNbr); @@ -32,7 +33,7 @@ class Parser{ public: Parser(); - Parser(Glib::ustring text); + Parser(Utf8String text); Parser(char* text); void change_text(char* text); @@ -42,7 +43,7 @@ class Parser{ std::string trad(); std::string simp(); std::string change_script(); - std::vector get_segments(); + std::vector get_segments(); bool guess_script(); void parse_against_index(Index& index); @@ -72,7 +73,7 @@ Parser::Parser() { * */ template -Parser::Parser(Glib::ustring text) { +Parser::Parser(Utf8String text) { text_to_parse = text; init_maps(); } @@ -82,7 +83,7 @@ Parser::Parser(Glib::ustring text) { */ template Parser::Parser(char* text) { - text_to_parse = Glib::ustring(text); + text_to_parse = Utf8String(text); init_maps(); } @@ -92,24 +93,24 @@ Parser::Parser(char* text) { */ template inline void Parser::init_maps() { - convertChinese2Latin.insert(std::pair("。",".")); - convertChinese2Latin.insert(std::pair("、",",")); - convertChinese2Latin.insert(std::pair("?","?")); - convertChinese2Latin.insert(std::pair(",",",")); - convertChinese2Latin.insert(std::pair("!","!")); - convertChinese2Latin.insert(std::pair(";",";")); - convertChinese2Latin.insert(std::pair(":",":")); - convertChinese2Latin.insert(std::pair("‘","'")); - convertChinese2Latin.insert(std::pair("“","\"")); - convertChinese2Latin.insert(std::pair("”","\"")); - - convertLatin2Chinese.insert(std::pair(".","。")); - convertLatin2Chinese.insert(std::pair("?","?")); - convertLatin2Chinese.insert(std::pair(",",",")); - convertLatin2Chinese.insert(std::pair("!","!")); - convertLatin2Chinese.insert(std::pair(";",";")); - convertLatin2Chinese.insert(std::pair(":",":")); - convertLatin2Chinese.insert(std::pair("\"","“")); + convertChinese2Latin.insert(std::pair("。",".")); + convertChinese2Latin.insert(std::pair("、",",")); + convertChinese2Latin.insert(std::pair("?","?")); + convertChinese2Latin.insert(std::pair(",",",")); + convertChinese2Latin.insert(std::pair("!","!")); + convertChinese2Latin.insert(std::pair(";",";")); + convertChinese2Latin.insert(std::pair(":",":")); + convertChinese2Latin.insert(std::pair("‘","'")); + convertChinese2Latin.insert(std::pair("“","\"")); + convertChinese2Latin.insert(std::pair("”","\"")); + + convertLatin2Chinese.insert(std::pair(".","。")); + convertLatin2Chinese.insert(std::pair("?","?")); + convertLatin2Chinese.insert(std::pair(",",",")); + convertLatin2Chinese.insert(std::pair("!","!")); + convertLatin2Chinese.insert(std::pair(";",";")); + convertLatin2Chinese.insert(std::pair(":",":")); + convertLatin2Chinese.insert(std::pair("\"","“")); } @@ -120,7 +121,7 @@ inline void Parser::init_maps() { template void Parser::change_text(char* text) { - text_to_parse = Glib::ustring(text); + text_to_parse = Utf8String(text); segments.clear(); items.clear(); } @@ -178,17 +179,27 @@ void Parser::parse_against_index(Index & index) { // borderline :p) while (startPosition < text_to_parse.size()) { - Glib::ustring longestMatchBlock = ""; + std::string longestMatchBlock = ""; T* longestMatchItem = NULL; + // we use this because we can't rely on longestMatchBlock.size() + // to give us the number of utf-8 character, as it return simply + // a number of byte, and in utf-8: + // number of byte != number of character + size_t sizeLongestMatchBlock = 0; - std::string tempBlock = text_to_parse.substr(startPosition,1); + std::string tempBlock = text_to_parse.substr(startPosition, 1); T* tempItem = index.get_item(tempBlock); if (tempItem != NULL) { // normal mode try to find the longest word starting // at the end of the previous one - for (int i = 1; i <= MIN(text_to_parse.size()-startPosition, WINDOW) ; i++) { + size_t maxSubstringSize = std::min( + text_to_parse.size() - startPosition, + static_cast(WINDOW) + ); + + for (int i = 1; i <= maxSubstringSize; i++) { tempBlock = text_to_parse.substr(startPosition,i); @@ -198,6 +209,7 @@ void Parser::parse_against_index(Index & index) { // so we consider it as the current longest one if (tempItem != NULL) { longestMatchBlock = tempBlock; + sizeLongestMatchBlock = i; longestMatchItem = tempItem; } } @@ -217,6 +229,7 @@ void Parser::parse_against_index(Index & index) { break; } + sizeLongestMatchBlock++; longestMatchBlock += tempBlock; longestMatchItem = tempItem; } @@ -225,7 +238,7 @@ void Parser::parse_against_index(Index & index) { segments.push_back(longestMatchBlock); items.push_back(longestMatchItem); - startPosition += longestMatchBlock.size(); + startPosition += sizeLongestMatchBlock; } } @@ -428,8 +441,8 @@ std::string Parser::change_script() { */ template -Glib::ustring Parser::convert_trash_segment(Glib::ustring segment, bool toLatin) { - Glib::ustring temp(""); +std::string Parser::convert_trash_segment(std::string segment, bool toLatin) { + std::string temp(""); for (int i = 0; i < segment.size() ; i++) { temp += convert_trash_char(segment.substr(i,1), toLatin); } @@ -442,10 +455,10 @@ Glib::ustring Parser::convert_trash_segment(Glib::ustring segment, bool toLat */ template -Glib::ustring Parser::convert_trash_char(Glib::ustring trashChar, bool toLatin) { +std::string Parser::convert_trash_char(std::string trashChar, bool toLatin) { - std::map::iterator iter; - std::map convertMap ; + std::map::iterator iter; + std::map convertMap ; if (toLatin) { convertMap = convertChinese2Latin; @@ -468,7 +481,7 @@ Glib::ustring Parser::convert_trash_char(Glib::ustring trashChar, bool toLati */ template -std::vector Parser::get_segments() { +std::vector Parser::get_segments() { return segments; } diff --git a/src/Utf8String.cpp b/src/Utf8String.cpp new file mode 100644 index 0000000..782a976 --- /dev/null +++ b/src/Utf8String.cpp @@ -0,0 +1,141 @@ +#include +#include "Utf8String.h" + + +/** + * + */ +int character_octet_size (const unsigned char firstByte) { + + // lead bit is zero, must be a single ascii + if ((firstByte & 0x80 ) == 0 ) { + return 1; + } + + // 110x xxxx + if ((firstByte & 0xE0 ) == 0xC0 ) { + return 2; + } + // 1110 xxxx + if ((firstByte & 0xF0 ) == 0xE0 ) { + return 3; + } + // 1111 0xxx + if ((firstByte & 0xF8 ) == 0xF0 ) { + return 4; + } + + // Unrecognized lead byte (%02x)\n", firstByte ); + return -1; +} + +/** + * + */ +Segments create_new_utf8_string (std::string stringToSegment) { + + std::string utf8Character; + //4 because that's the max size in byte of a utf-8 character + utf8Character.reserve(4); + Segments segmentedLine; + + int characterSize = 0; + for (int i = 0; i < stringToSegment.size(); i++) { + unsigned char byte = stringToSegment[i]; + //if we we have finished previous utf8 character + //we take the size of the new one + if (characterSize <= 0) { + characterSize = character_octet_size(byte); + } + + //we push current byte in current utf-8 character + //we're building + utf8Character.push_back(byte); + characterSize--; + + //if we have consumed all bytes of current utf-8 character + //it is now ready to be added to the list of utf-8 characters + if (characterSize <= 0) { + segmentedLine.push_back(utf8Character); + segment.clear(); + } + } + + return segmentedLine; +}; + + +/** + * + */ +Utf8String::Utf8String() { + +}; + +/** + * + */ +Utf8String::Utf8String(std::string stringToSegment) { + segments = create_new_utf8_string(stringToSegment); +}; + +/** + * + */ +Utf8String::Utf8String(char* charToSegment) { + //it should be possible to have something a little more efficient + //by not using an intermediate string, but rather iterating on the array + //of char until we met a \0 + std::string stringToSegment(charToSegment); + segments = create_new_utf8_string(stringToSegment); +}; + +/** + * + */ +Utf8String::Utf8String(const char* charToSegment) { + //it should be possible to have something a little more efficient + //by not using an intermediate string, but rather iterating on the array + //of char until we met a \0 + std::string stringToSegment(charToSegment); + segments = create_new_utf8_string(stringToSegment); +}; + +/** + * + */ +std::string Utf8String::to_string() const { + std::string toOutput; + for (int i = 0; i < segments.size(); i++) { + toOutput += (segments[i]); + } + + return toOutput; +}; + +/** + * + */ +size_t Utf8String::size() const { + return segments.size(); +} + + +/** + * + */ +std::string Utf8String::substr(size_t start, size_t size) const { + std::string toOutput; + for (int i = 0; i < size; i++) { + toOutput += (segments[start + i]); + } + return toOutput; +} + + +/** + * + */ +std::ostream& operator<< (std::ostream& stream, const Utf8String& utf8String) { + stream << utf8String.to_string(); +} diff --git a/src/Utf8String.h b/src/Utf8String.h new file mode 100644 index 0000000..db1508e --- /dev/null +++ b/src/Utf8String.h @@ -0,0 +1,56 @@ +#include +#include +#include + +/** + * Use the first byte of a utf-8 character to determine how long + * in byte this character will be, return that size + * or -1 if the first byte is a non valid utf-8 character first byte + */ +int character_octet_size (const unsigned char firstByte); + +typedef std::vector Segments; + +/** + * segment a standard string into a vector of utf-8 character + */ +Segments create_new_utf8_string (std::string); + + +/** + * Minimal String-like class to represent a utf-8 string + * in order to work on a per-character basis rather than per byte + */ +class Utf8String { + private: + Segments segments; + + public: + Utf8String(); + Utf8String(std::string); + Utf8String(char*); + Utf8String(const char*); + + /** + * Return a standard string representation of the instance + */ + std::string to_string() const; + + /** + * Equivalent of std::string substr function but works on actual + * characters rathen than bytes + */ + std::string substr(size_t start, size_t size) const; + + /** + * Get the size of the string, in term of number of actual character + * not size in bytes + */ + size_t size() const; +}; + +/** + * Operator overloading so that we can print a Utf8String + * using normal cout << like operations + */ +std::ostream& operator<< (std::ostream& stream, const Utf8String& utf8String); diff --git a/src/main.cpp b/src/main.cpp index 36f04a8..4211b1d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,5 +1,4 @@ #include -#include #include "Server.h" #include "Args.h" #include "Database.h"