From 61fc53f0001ba5457cb777f87f7909fe8990bec5 Mon Sep 17 00:00:00 2001 From: mivan Date: Mon, 13 Feb 2017 14:53:37 +0100 Subject: [PATCH] Fixed #21. --- src/quex_modules/definitions.qx | 2 +- src/quex_modules/token.qx | 69 ++++++++++++++++--------- test/test_default_token_enumofnames.txt | 6 +++ 3 files changed, 52 insertions(+), 25 deletions(-) diff --git a/src/quex_modules/definitions.qx b/src/quex_modules/definitions.qx index cc2cad4..4213c67 100644 --- a/src/quex_modules/definitions.qx +++ b/src/quex_modules/definitions.qx @@ -232,7 +232,7 @@ define { // Ez a régi: ([a-zµ¿»¶±¼¾¹³áàăâåäãąæćčçďđðéèêěëęíìîïĺľłńňñóòôöőõøºŕřśšşßťţúùûůüűýźžżþ]) // Megj.: union(\P{Lowercase}, \P{Other_Lowercase} is lehet hogy jó // \G{Lowercase_Letter} helyett -- vajon van különbség? - // UPPER: nagybetűk + // UPPER: nagybetűk // Megj.: Csaba kódja alapján meg ezek voltak benne [§¡£¥¦©ª«¬®¯], // most csak a monogrammokhoz kell az UPPER, ugyhogy kivettem. // Ez a régi: ([A-ZÁÉÍÓÖÕÚÜÛ§¡£¥¦©ª«¬®¯ÀÂÃÄÅÆÇÈÊËÌÎÏÐÑÒÔØÙÝÞ]) diff --git a/src/quex_modules/token.qx b/src/quex_modules/token.qx index ee10647..235059e 100644 --- a/src/quex_modules/token.qx +++ b/src/quex_modules/token.qx @@ -66,7 +66,7 @@ define { //O P E R A T I O N S ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ OPERAND ([0-9]+[.,])?[0-9]+ // decimal fractions - OPERATION_WS [  ] // space (0020), non-breaking space (00A0) + OPERATION_WS [  ] // space (0020), non-breaking space (00A0) OPERATOR {OPERATION_WS}?[*+]{OPERATION_WS}? OPERATION {OPERAND}({OPERATOR}{OPERAND})+ @@ -159,9 +159,51 @@ mode PROGRAM : COMMON { self_send1(token_TOKEN, LEX.c_str()); } + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // enumerate of names with hyphen + // mondatvegi verzio + (\G{Uppercase_Letter}\G{Lowercase_Letter}+)("-"\G{Uppercase_Letter}\G{Lowercase_Letter}+){2,}"."{SNT_CLOSE_QX} { + /* std::wcerr << L"enum of names with hyphen (snt end): " << Lexeme << std::endl; */ + std::wstring LEX(Lexeme, wcslen(Lexeme)-2); + std::wstring res; + for(auto c : LEX) + { + if(c!=L'-') + { + res.push_back(c); + } + else + { + res.append(self.WORD_CLOSE_CPP + self.PUNCT_OPEN_CPP + L"-" + self.PUNCT_CLOSE_CPP + self.WORD_OPEN_CPP); + } + } + res = self.WORD_OPEN_CPP + res + self.WORD_CLOSE_CPP + self.PUNCT_OPEN_CPP + L"." + self.PUNCT_CLOSE_CPP + self.SNT_CLOSE_CPP; + self_send1(token_TOKEN, res.c_str()); + } + // mondatkozi verzio + (\G{Uppercase_Letter}\G{Lowercase_Letter}+)("-"\G{Uppercase_Letter}\G{Lowercase_Letter}+){2,} { + /* std::wcerr << L"enum of names with hyphen: " << Lexeme << std::endl; */ + std::wstring LEX(Lexeme); + std::wstring res; + for(auto c : LEX) + { + if(c!=L'-') + { + res.push_back(c); + } + else + { + res.append(self.WORD_CLOSE_CPP + self.PUNCT_OPEN_CPP + L"-" + self.PUNCT_CLOSE_CPP + self.WORD_OPEN_CPP); + } + } + res = self.WORD_OPEN_CPP + res + self.WORD_CLOSE_CPP; + self_send1(token_TOKEN, res.c_str()); + } + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // operations with decimal fractions {OPERATION}("."{SNT_CLOSE_QX})? { + /* std::wcerr << L"operations: " << Lexeme << std::endl; */ std::wstring LEX(Lexeme); self.operation_processing(LEX); self_send1(token_TOKEN, LEX.c_str()); @@ -203,27 +245,6 @@ mode PROGRAM : COMMON { self_send1(token_PUNCT, LEX.c_str()); } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // enumerate of names with hyphen - ({UPPER}{LOWER}+)("-"{UPPER}{LOWER}+){2,} { - /* std::wcerr << L"enum of names with hyphen: " << Lexeme << std::endl; */ - std::wstring LEX(Lexeme); - std::wstring tmp; - for(auto c : LEX) - { - if(c!=L'-') - { - tmp.push_back(c); - } - else - { - tmp.append(self.WORD_CLOSE_CPP + self.PUNCT_OPEN_CPP + L"-" + self.PUNCT_CLOSE_CPP + self.WORD_OPEN_CPP); - } - } - tmp.swap(LEX); - LEX = self.WORD_OPEN_CPP + LEX + self.WORD_CLOSE_CPP; - self_send1(token_TOKEN, LEX.c_str()); - } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // basic tokenizer rules: @@ -235,14 +256,14 @@ mode PROGRAM : COMMON { // par(en)thesis, but not (parenthesis) // verision for words with "-e" ((({WORDS_WITH_DOTS}?{WORD_IN_PAR})?{WORDS_WITH_DOTS})|({WORDS_WITH_DOTS}?{WORD_IN_PAR}))"-e"("."+)?{SNT_CLOSE_QX}? { - /* std::wcerr << L"basic with -e! " << Lexeme << std::endl; */ + /* std::wcerr << L"basic (-e): " << Lexeme << std::endl; */ std::wstring LEX(Lexeme); self.particula_token_corrig(LEX); self_send1(token_TOKEN, LEX.c_str()); } // version for other cases: ((({WORD_IN_PAR})?{WORDS_WITH_DOTS})|({WORDS_WITH_DOTS}{WORD_IN_PAR}{WORDS_WITH_DOTS}?))("."+{SNT_CLOSE_QX})? { - /* std::wcerr << L"basic! " << Lexeme << std::endl; */ + /* std::wcerr << L"basic: " << Lexeme << std::endl; */ std::wstring LEX(Lexeme); self.basic_token_corrig(LEX); self_send1(token_TOKEN, LEX.c_str()); diff --git a/test/test_default_token_enumofnames.txt b/test/test_default_token_enumofnames.txt index 9e892bb..71c404b 100644 --- a/test/test_default_token_enumofnames.txt +++ b/test/test_default_token_enumofnames.txt @@ -21,3 +21,9 @@ OUT: Nana-Nini-Nene IN : Nana-Nini-Nene-Nono OUT: Nana-Nini-Nene-Nono +IN : Nana-Nini-Nene. +OUT: Nana-Nini-Nene. + +IN : A Nana-Nini-Nene-Nono. +OUT: A Nana-Nini-Nene-Nono. +