Reset the start_char and end_char on single Word objects if the Token…

… object has start_char and end_char. This accommodates MWT Tokens which were detected by the tokenizer but not expanded by the MWT model, which can happen with typos such as 'it"s' (#1436).
stanfordnlp · Nov 27, 2024 · 1a36efb · 1a36efb
1 parent 081d1dc
commit 1a36efb
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/stanza/models/common/doc.py b/stanza/models/common/doc.py
@@ -396,7 +396,7 @@ def set_mwt_expansions(self, expansions,
                     word.sent = sentence
                     word.parent = token
                     sentence.words.append(word)
-                if len(token.words) > 1 and token.start_char is not None and token.end_char is not None and "".join(word.text for word in token.words) == token.text:
+                if token.start_char is not None and token.end_char is not None and "".join(word.text for word in token.words) == token.text:
                     start_char = token.start_char
                     for word in token.words:
                         end_char = start_char + len(word.text)