From 9c30dbe5d2a3868d45da3d29468db583a4986ecb Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Fri, 29 Dec 2023 15:54:07 -0500 Subject: [PATCH] BM25 tokenizer lowercase (#9745) --- llama_index/retrievers/bm25_retriever.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama_index/retrievers/bm25_retriever.py b/llama_index/retrievers/bm25_retriever.py index d1fcf45601606..61cc5be388e1c 100644 --- a/llama_index/retrievers/bm25_retriever.py +++ b/llama_index/retrievers/bm25_retriever.py @@ -15,6 +15,8 @@ def tokenize_remove_stopwords(text: str) -> List[str]: + # lowercase and stem words + text = text.lower() stemmer = PorterStemmer() words = list(simple_extract_keywords(text)) return [stemmer.stem(word) for word in words]