@@ -190,18 +190,20 @@ export class ChromaBm25EmbeddingFunction implements SparseEmbeddingFunction {
190190 }
191191
192192 private encode ( text : string ) : SparseVector {
193- const tokens = this . tokenizer . tokenize ( text ) ;
193+ const tokenList = this . tokenizer . tokenize ( text ) ;
194194
195- if ( tokens . length === 0 ) {
195+ if ( tokenList . length === 0 ) {
196196 return { indices : [ ] , values : [ ] } ;
197197 }
198198
199- const docLen = tokens . length ;
199+ const docLen = tokenList . length ;
200200 const counts = new Map < number , number > ( ) ;
201+ const tokenMap = new Map < number , string > ( ) ;
201202
202- for ( const token of tokens ) {
203+ for ( const token of tokenList ) {
203204 const tokenId = this . hasher . hash ( token ) ;
204205 counts . set ( tokenId , ( counts . get ( tokenId ) ?? 0 ) + 1 ) ;
206+ tokenMap . set ( tokenId , token ) ;
205207 }
206208
207209 const indices = Array . from ( counts . keys ( ) ) . sort ( ( a , b ) => a - b ) ;
@@ -213,8 +215,9 @@ export class ChromaBm25EmbeddingFunction implements SparseEmbeddingFunction {
213215 ( 1 - this . b + ( this . b * docLen ) / this . avgDocLength ) ;
214216 return ( tf * ( this . k + 1 ) ) / denominator ;
215217 } ) ;
218+ const tokens = indices . map ( ( idx ) => tokenMap . get ( idx ) ! ) ;
216219
217- return { indices, values } ;
220+ return { indices, values, tokens } ;
218221 }
219222
220223 public async generate ( texts : string [ ] ) : Promise < SparseVector [ ] > {
0 commit comments