diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py
index e0631c19f3cb..d6fc90ac5665 100644
--- a/rasa/nlu/tokenizers/jieba_tokenizer.py
+++ b/rasa/nlu/tokenizers/jieba_tokenizer.py
@@ -101,7 +101,15 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         text = message.get(attribute)
 
         tokenized = jieba.tokenize(text)
-        tokens = [Token(word, start) for (word, start, end) in tokenized]
+        tokens = []
+        current_position = 0
+        for word, start, end in tokenized:
+            if word.strip() == "":
+                continue
+            word_start = text.find(word, current_position)
+            word_end = word_start + len(word)
+            tokens.append(Token(word, word_start, word_end))
+            current_position = word_end
 
         return self._apply_token_pattern(tokens)
 
diff --git a/tests/nlu/tokenizers/test_jieba_tokenizer.py b/tests/nlu/tokenizers/test_jieba_tokenizer.py
index c0628f901a87..f2d93471d84a 100644
--- a/tests/nlu/tokenizers/test_jieba_tokenizer.py
+++ b/tests/nlu/tokenizers/test_jieba_tokenizer.py
@@ -37,6 +37,11 @@ def create_jieba(config: Optional[Dict] = None) -> JiebaTokenizer:
             ["Micheal", "你好", "吗", "?"],
             [(0, 7), (7, 9), (9, 10), (10, 11)],
         ),
+        (
+            "安装 rasa 应用",
+            ["安装", "rasa", "应用"],
+            [(0, 2), (3, 7), (8, 10)],
+        ),
     ],
 )
 def test_jieba(text, expected_tokens, expected_indices):
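
For context (not part of the diff): below is a minimal standalone sketch of the new offset logic, assuming only that `jieba` is installed. It mirrors the loop added to `JiebaTokenizer.tokenize()`: whitespace-only tokens emitted by `jieba.tokenize()` are skipped, and each surviving token's start/end is recomputed against the original text with `str.find`, which is what yields the indices asserted in the new test case.

```python
# Illustrative sketch only, not part of the PR: replicates the new loop in
# JiebaTokenizer.tokenize() on the sentence used by the added test case.
import jieba

text = "安装 rasa 应用"

tokens = []
current_position = 0
for word, start, end in jieba.tokenize(text):
    if word.strip() == "":
        continue  # jieba yields whitespace as its own token; drop it
    # Recompute offsets against the original text rather than relying on
    # jieba's (start, end), searching forward from the previous token's end.
    word_start = text.find(word, current_position)
    word_end = word_start + len(word)
    tokens.append((word, word_start, word_end))
    current_position = word_end

# With the segmentation assumed by the new test case, this should print:
# [('安装', 0, 2), ('rasa', 3, 7), ('应用', 8, 10)]
print(tokens)
```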