From f994465f0f5c9304ebb9830926f3df130cf9643a Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Mon, 2 Jan 2023 14:32:06 +0100
Subject: [PATCH] Improve detokenization performance with more recent NLTK versions

---
 Tokenizer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Tokenizer.py b/Tokenizer.py
index 1334551..65d3dd0 100644
--- a/Tokenizer.py
+++ b/Tokenizer.py
@@ -99,11 +99,12 @@ def detokenize(tokenized: List[str]) -> str:
 
     Returns:
         str: The correct string sentence, e.g. "Hello, I'm Tom"
     """
-    indices = [index for index, token in enumerate(tokenized) if token in ("''", "'")]
+    indices = [index for index, token in enumerate(tokenized) if token in ("''", "'", '"')]
+    # Replace '' with ", works better with more recent NLTK versions
+    tokenized_copy = [token if token != "''" else '"' for token in tokenized]
     # We get the reverse of the enumerate, as we modify the list we took the indices from
     enumerated = list(enumerate(indices))
-    tokenized_copy = deepcopy(tokenized)
     for i, index in enumerated[::-1]:
         # Opening quote
         if i % 2 == 0:
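
Note (not part of the patch): a minimal standalone sketch of what the added list comprehension does, assuming NLTK-style Treebank tokenization where an opening double quote is emitted as `` and a closing one as ''. The sample token list is hypothetical and exists only to illustrate the normalization step introduced above.

# Hypothetical NLTK-style token stream for the sentence: "Hello, I'm Tom"
tokenized = ["``", "Hello", ",", "I", "'m", "Tom", "''"]

# Same normalization the patch adds: fold '' back into a plain " so the
# later opening/closing-quote pairing only has to handle one closing symbol.
tokenized_copy = [token if token != "''" else '"' for token in tokenized]

print(tokenized_copy)  # ['``', 'Hello', ',', 'I', "'m", 'Tom', '"']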