Skip to content

Commit

Permalink
Handle special token in TikToken (#528)
Browse files Browse the repository at this point in the history
* Handle special token in TikToken
resolves #525

* remove duplicate method
add clarification comment on implementation
  • Loading branch information
timothycarambat authored Jan 4, 2024
1 parent a2a9037 commit 92da23e
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 10 deletions.
2 changes: 1 addition & 1 deletion server/utils/helpers/chat/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ function cannonball({
// if the delta is the token difference between where our prompt is in size
// and where we ideally need to land.
const delta = initialInputSize - targetTokenSize;
const tokenChunks = tokenManager.tokensFromString(input);
const tokenChunks = tokenManager.countFromString(input);
const middleIdx = Math.floor(tokenChunks.length / 2);

// middle truncate the text going left and right of midpoint
Expand Down
14 changes: 5 additions & 9 deletions server/utils/helpers/tiktoken.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,27 @@ const { getEncodingNameForModel, getEncoding } = require("js-tiktoken");
class TokenManager {
constructor(model = "gpt-3.5-turbo") {
this.model = model;
this.encoderName = this.getEncodingFromModel(model);
this.encoderName = this.#getEncodingFromModel(model);
this.encoder = getEncoding(this.encoderName);
this.buffer = 50;
}

getEncodingFromModel(model) {
#getEncodingFromModel(model) {
try {
return getEncodingNameForModel(model);
} catch {
return "cl100k_base";
}
}

tokensFromString(input = "") {
const tokens = this.encoder.encode(input);
return tokens;
}

bytesFromTokens(tokens = []) {
const bytes = this.encoder.decode(tokens);
return bytes;
}

// Pass an empty array as disallowedSpecial so that special-token strings appearing in the input
// are treated as ordinary text and tokenized normally, instead of being rejected by the encoder.
// https://github.com/openai/tiktoken/blob/9e79899bc248d5313c7dd73562b5e211d728723d/tiktoken/core.py#L91C20-L91C38
countFromString(input = "") {
const tokens = this.encoder.encode(input);
const tokens = this.encoder.encode(input, undefined, []);
return tokens.length;
}

Expand Down

0 comments on commit 92da23e

Please sign in to comment.