From fa634b22d649e6d9b65fec024ae14d5dfe50e3c5 Mon Sep 17 00:00:00 2001
From: Allan Simon <simona@gobeta.com.cn>
Date: Mon, 7 Apr 2014 13:45:48 +0800
Subject: [PATCH] close #4, we no longer rely on glibmm, so it should
 compile fine on most architectures

---
 CMakeLists.txt     |   7 ---
 README.md          |   1 -
 src/Parser.h       |  93 +++++++++++++++++-------------
 src/Utf8String.cpp | 141 +++++++++++++++++++++++++++++++++++++++++++++
 src/Utf8String.h   |  56 ++++++++++++++++++
 src/main.cpp       |   1 -
 6 files changed, 250 insertions(+), 49 deletions(-)
 create mode 100644 src/Utf8String.cpp
 create mode 100644 src/Utf8String.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b78367..66a8f4e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,12 +2,6 @@ cmake_minimum_required(VERSION 2.6)
 
 project(sinoparserd)
 include_directories(
-    /usr/lib/i386-linux-gnu/glib-2.0/include/
-    /usr/include/glibmm-2.4
-    /usr/lib/glib-2.0/include/
-    /usr/lib/glibmm-2.4/include
-    /usr/include/glib-2.0
-    /usr/lib/i386-linux-gnu/glibmm-2.4/include/
     src
 )
 
@@ -27,5 +21,4 @@ target_link_libraries(
     stdc++
     expat
     event
-    glibmm-2.4
 )
diff --git a/README.md b/README.md
index 5f32998..d021d72 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,6 @@ The data files are provided only as example
 ### Requirement ###
 
   * libexpat
-  * libgmm++
   * libevent
 
 ### Usage ###
diff --git a/src/Parser.h b/src/Parser.h
index cbe9b6d..38c862a 100644
--- a/src/Parser.h
+++ b/src/Parser.h
@@ -1,11 +1,12 @@
 #ifndef SINOPARSER_PARSER_H
 #define SINOPARSER_PARSER_H
 
-#include <glibmm/ustring.h>
 #include <vector>
 #include <map>
+#include <algorithm>
 #include "Index.h"
 #include "Database.h"
+#include "Utf8String.h"
 
 #define TRADITIONAL_SCRIPT true
 #define SIMPLIFIED_SCRIPT false
@@ -14,16 +15,16 @@
 template <typename T>
 class Parser{
     private:
-        Glib::ustring text_to_parse;
-        std::vector<Glib::ustring> segments;
+        Utf8String text_to_parse;
+        std::vector<std::string> segments;
         std::vector<T*> items;
-        std::map<Glib::ustring, Glib::ustring> convertChinese2Latin;
-        std::map<Glib::ustring, Glib::ustring> convertLatin2Chinese;
+        std::map<std::string, std::string> convertChinese2Latin;
+        std::map<std::string, std::string> convertLatin2Chinese;
 
         inline void init_maps();
 
-        Glib::ustring convert_trash_segment(Glib::ustring segment, bool toLatin);
-        Glib::ustring convert_trash_char(Glib::ustring trashChar, bool toLatin);
+        std::string convert_trash_segment(std::string segment, bool toLatin);
+        std::string convert_trash_char(std::string trashChar, bool toLatin);
 
 
         inline std::string romanize_segment(int segmentNbr);
@@ -32,7 +33,7 @@ class Parser{
 
     public:
         Parser();
-        Parser(Glib::ustring text);
+        Parser(Utf8String text);
         Parser(char* text);
 
         void change_text(char* text); 
@@ -42,7 +43,7 @@ class Parser{
         std::string trad();
         std::string simp();
         std::string change_script();
-        std::vector<Glib::ustring> get_segments();
+        std::vector<std::string> get_segments();
 
         bool guess_script();
         void parse_against_index(Index<T>& index);
@@ -72,7 +73,7 @@ Parser<T>::Parser() {
  * 
  */
 template <typename T>
-Parser<T>::Parser(Glib::ustring text) {
+Parser<T>::Parser(Utf8String text) {
     text_to_parse = text;
     init_maps();
 }
@@ -82,7 +83,7 @@ Parser<T>::Parser(Glib::ustring text) {
  */
 template <typename T>
 Parser<T>::Parser(char* text) {
-    text_to_parse = Glib::ustring(text);
+    text_to_parse = Utf8String(text);
     init_maps();
 
 }
@@ -92,24 +93,24 @@ Parser<T>::Parser(char* text) {
  */
 template <typename T>
 inline void Parser<T>::init_maps() {
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("。","."));
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("、",","));
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("？","?"));
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("，",","));
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("！","!"));
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("；",";"));
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("：",":"));
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("‘","'"));
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("“","\""));
-    convertChinese2Latin.insert(std::pair<Glib::ustring, Glib::ustring>("”","\""));
-
-    convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>(".","。"));
-    convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>("?","？"));
-    convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>(",","，"));
-    convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>("!","！"));
-    convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>(";","；"));
-    convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>(":","："));
-    convertLatin2Chinese.insert(std::pair<Glib::ustring, Glib::ustring>("\"","“"));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("。","."));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("、",","));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("？","?"));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("，",","));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("！","!"));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("；",";"));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("：",":"));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("‘","'"));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("“","\""));
+    convertChinese2Latin.insert(std::pair<std::string, std::string>("”","\""));
+
+    convertLatin2Chinese.insert(std::pair<std::string, std::string>(".","。"));
+    convertLatin2Chinese.insert(std::pair<std::string, std::string>("?","？"));
+    convertLatin2Chinese.insert(std::pair<std::string, std::string>(",","，"));
+    convertLatin2Chinese.insert(std::pair<std::string, std::string>("!","！"));
+    convertLatin2Chinese.insert(std::pair<std::string, std::string>(";","；"));
+    convertLatin2Chinese.insert(std::pair<std::string, std::string>(":","："));
+    convertLatin2Chinese.insert(std::pair<std::string, std::string>("\"","“"));
 
 
 }
@@ -120,7 +121,7 @@ inline void Parser<T>::init_maps() {
 
 template <typename T>
 void Parser<T>::change_text(char* text) {
-    text_to_parse = Glib::ustring(text);
+    text_to_parse = Utf8String(text);
     segments.clear();
     items.clear();
 }
@@ -178,17 +179,27 @@ void Parser<T>::parse_against_index(Index<T> & index) {
     // borderline :p)
 
     while (startPosition < text_to_parse.size()) {
-        Glib::ustring longestMatchBlock = "";
+        std::string longestMatchBlock = "";
         T* longestMatchItem = NULL;
+        // we use this because we can't rely on longestMatchBlock.size()
+        // to give us the number of utf-8 character, as it return simply
+        // a number of byte, and in utf-8:
+        // number of byte != number of character
+        size_t sizeLongestMatchBlock = 0;
 
-        std::string tempBlock = text_to_parse.substr(startPosition,1);
+        std::string tempBlock = text_to_parse.substr(startPosition, 1);
 
         T* tempItem  = index.get_item(tempBlock);
 
         if (tempItem != NULL) {
             // normal mode try to find the longest word starting 
             // at the end of the previous one
-            for (int i = 1; i  <= MIN(text_to_parse.size()-startPosition, WINDOW) ; i++) { 
+            size_t maxSubstringSize = std::min(
+                text_to_parse.size() - startPosition,
+                static_cast<size_t>(WINDOW)
+            );
+
+            for (int i = 1; i <= maxSubstringSize; i++) { 
                 
                 tempBlock = text_to_parse.substr(startPosition,i);
 
@@ -198,6 +209,7 @@ void Parser<T>::parse_against_index(Index<T> & index) {
                 // so we consider it as the current longest one
                 if (tempItem != NULL) {
                     longestMatchBlock = tempBlock;
+                    sizeLongestMatchBlock = i;
                     longestMatchItem  = tempItem;
                 }
             }
@@ -217,6 +229,7 @@ void Parser<T>::parse_against_index(Index<T> & index) {
                     break;
                 }
 
+                sizeLongestMatchBlock++;
                 longestMatchBlock += tempBlock;
                 longestMatchItem  = tempItem;
             }
@@ -225,7 +238,7 @@ void Parser<T>::parse_against_index(Index<T> & index) {
         segments.push_back(longestMatchBlock); 
         items.push_back(longestMatchItem);
 
-        startPosition += longestMatchBlock.size();
+        startPosition += sizeLongestMatchBlock;
     }
    
 }
@@ -428,8 +441,8 @@ std::string Parser<T>::change_script() {
  */
 
 template <typename T>
-Glib::ustring Parser<T>::convert_trash_segment(Glib::ustring segment, bool toLatin) {
-    Glib::ustring temp("");
+std::string Parser<T>::convert_trash_segment(std::string segment, bool toLatin) {
+    std::string temp("");
     for (int i = 0; i < segment.size() ; i++) {
         temp += convert_trash_char(segment.substr(i,1), toLatin); 
     }
@@ -442,10 +455,10 @@ Glib::ustring Parser<T>::convert_trash_segment(Glib::ustring segment, bool toLat
  */
 
 template <typename T>
-Glib::ustring Parser<T>::convert_trash_char(Glib::ustring trashChar, bool toLatin) {
+std::string Parser<T>::convert_trash_char(std::string trashChar, bool toLatin) {
 
-    std::map<Glib::ustring, Glib::ustring>::iterator iter;
-    std::map<Glib::ustring, Glib::ustring> convertMap ;
+    std::map<std::string, std::string>::iterator iter;
+    std::map<std::string, std::string> convertMap ;
 
     if (toLatin) {
         convertMap = convertChinese2Latin;
@@ -468,7 +481,7 @@ Glib::ustring Parser<T>::convert_trash_char(Glib::ustring trashChar, bool toLati
  */
 
 template <typename T>
-std::vector<Glib::ustring> Parser<T>::get_segments() {
+std::vector<std::string> Parser<T>::get_segments() {
     return segments;
 }
 
diff --git a/src/Utf8String.cpp b/src/Utf8String.cpp
new file mode 100644
index 0000000..782a976
--- /dev/null
+++ b/src/Utf8String.cpp
@@ -0,0 +1,141 @@
+#include <iostream>
+#include "Utf8String.h"
+
+
+/**
+ * Byte length of the UTF-8 sequence introduced by firstByte, or -1 if invalid.
+ */
+int character_octet_size (const unsigned char firstByte) {
+
+    // 0xxx xxxx: plain ascii, encoded on a single byte
+    if (firstByte < 0x80) {
+        return 1;
+    }
+
+    // 110x xxxx: lead byte of a two-byte sequence
+    if ((firstByte >> 5) == 0x06) {
+        return 2;
+    }
+    // 1110 xxxx: lead byte of a three-byte sequence
+    if ((firstByte >> 4) == 0x0E) {
+        return 3;
+    }
+    // 1111 0xxx: lead byte of a four-byte sequence
+    if ((firstByte >> 3) == 0x1E) {
+        return 4;
+    }
+
+    // continuation byte (10xx xxxx) or unrecognized lead byte (1111 1xxx)
+    return -1;
+}
+
+/**
+ * Split a UTF-8 encoded string into a vector holding one character per entry.
+ * Invalid lead bytes and a truncated trailing sequence get an entry of their own.
+ */
+Segments create_new_utf8_string (std::string stringToSegment) {
+    std::string utf8Character;
+    //4 because that's the max size in byte of a utf-8 character
+    utf8Character.reserve(4);
+    Segments segmentedLine;
+
+    int characterSize = 0;
+    for (size_t i = 0; i < stringToSegment.size(); i++) {
+        unsigned char byte = stringToSegment[i];
+        //previous character done: read the new size (-1 if invalid lead byte)
+        if (characterSize <= 0) {
+            characterSize = character_octet_size(byte);
+        }
+
+        //append current byte to the utf-8 character being built
+        utf8Character.push_back(byte);
+        characterSize--;
+
+        //all bytes consumed: store the character and start a new one
+        if (characterSize <= 0) {
+            segmentedLine.push_back(utf8Character);
+            utf8Character.clear();
+        }
+    }
+    //keep the bytes of a truncated trailing character instead of dropping them
+    if (!utf8Character.empty()) {
+        segmentedLine.push_back(utf8Character);
+    }
+    return segmentedLine;
+};
+
+
+/**
+ * Build an empty string: it holds no character, so size() is 0.
+ */
+Utf8String::Utf8String()
+    : segments() {
+};
+
+/**
+ * Build from a std::string assumed to hold UTF-8, split into characters.
+ */
+Utf8String::Utf8String(std::string stringToSegment)
+    : segments(create_new_utf8_string(stringToSegment)) {
+};
+
+/**
+ * Build from a NUL-terminated C string assumed to hold UTF-8 text.
+ * A null pointer yields an empty string instead of undefined behaviour.
+ */
+Utf8String::Utf8String(char* charToSegment) {
+    //std::string must not be constructed from a null pointer
+    if (charToSegment) {
+        segments = create_new_utf8_string(std::string(charToSegment));
+    }
+};
+
+/**
+ * Build from a NUL-terminated constant C string assumed to hold UTF-8 text.
+ * A null pointer yields an empty string instead of undefined behaviour.
+ */
+Utf8String::Utf8String(const char* charToSegment) {
+    //std::string must not be constructed from a null pointer
+    if (charToSegment) {
+        segments = create_new_utf8_string(std::string(charToSegment));
+    }
+};
+
+/**
+ * Join every stored character back into a single standard byte string.
+ */
+std::string Utf8String::to_string() const {
+    std::string joined;
+    for (Segments::const_iterator it = segments.begin(); it != segments.end(); ++it) {
+        joined.append(*it);
+    }
+
+    return joined;
+};
+
+/** Number of characters held by the string, not its size in bytes. */
+size_t Utf8String::size() const {
+    const size_t characterCount = segments.size();
+
+    return characterCount;
+}
+
+
+/**
+ * Return up to `size` characters from index `start`, clamped to the string end.
+ */
+std::string Utf8String::substr(size_t start, size_t size) const {
+    std::string toOutput;
+    for (size_t i = start; i < segments.size() && i - start < size; i++) {
+        toOutput += segments[i];
+    }
+    return toOutput;
+}
+
+
+/**
+ * Write the bytes of utf8String to stream; returns stream so calls can chain.
+ */
+std::ostream& operator<< (std::ostream& stream, const Utf8String& utf8String) {
+    return stream << utf8String.to_string();
+}
diff --git a/src/Utf8String.h b/src/Utf8String.h
new file mode 100644
index 0000000..db1508e
--- /dev/null
+++ b/src/Utf8String.h
@@ -0,0 +1,56 @@
+#ifndef SINOPARSER_UTF8STRING_H
+#define SINOPARSER_UTF8STRING_H
+#include <vector>
+#include <string>
+#include <ostream>
+
+/**
+ * Use the first byte of a utf-8 character to determine how long in bytes
+ * this character is; return -1 if it is not a valid utf-8 first byte
+ */
+int character_octet_size (const unsigned char firstByte);
+
+typedef std::vector<std::string> Segments; 
+
+/**
+ * segment a standard string into a vector of utf-8 characters
+ */
+Segments create_new_utf8_string (std::string);
+
+/**
+ * Minimal String-like class to represent a utf-8 string
+ * in order to work on a per-character basis rather than per byte
+ */
+class Utf8String {
+    private:
+        Segments segments;
+
+    public:
+        Utf8String();
+        Utf8String(std::string);
+        Utf8String(char*);
+        Utf8String(const char*);
+
+        /**
+         * Return a standard string representation of the instance
+         */
+        std::string to_string() const;
+
+        /**
+         * Equivalent of std::string substr function but works on actual
+         * characters rather than bytes
+         */
+        std::string substr(size_t start, size_t size) const;
+
+        /**
+         * Get the size of the string in number of characters, not bytes
+         */
+        size_t size() const;
+};
+
+/**
+ * Operator overloading so that a Utf8String can be printed with cout <<
+ */
+std::ostream& operator<< (std::ostream& stream, const Utf8String& utf8String);
+
+#endif
diff --git a/src/main.cpp b/src/main.cpp
index 36f04a8..4211b1d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,5 +1,4 @@
 #include <iostream>
-#include <glibmm/ustring.h>
 #include "Server.h"
 #include "Args.h"
 #include "Database.h"