Skip to content

Commit

Permalink
utf8 fix in tokenizer
Browse files (browse the repository at this point in the history)
  • Loading branch information
fieg committed Aug 26, 2014
1 parent 39d89ac commit 22210f4
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class WhitespaceAndPunctuationTokenizer implements TokenizerInterface

public function tokenize($string)
{
- $retval = preg_split($this->pattern, mb_strtolower($string));
+ $retval = preg_split($this->pattern, mb_strtolower($string, 'utf8'));
$retval = array_filter($retval, 'trim');
$retval = array_values($retval);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public function tokenizeDataProvider()
return array(
array('Hello, how are you?', array('hello', 'how', 'are', 'you')),
array("Hello\n\nHow are you?!", array('hello', 'how', 'are', 'you')),
+ array("Un importante punto de inflexión en la historia de la ciencia filosófica primitiva", array('un','importante','punto','de','inflexión','en','la','historia','de','la','ciencia','filosófica','primitiva')),
);
}
}

0 comments on commit 22210f4

Please sign in to comment.