From 94d247b63e04c4f1659f1e5ce21fa67253660358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Mon, 12 Feb 2024 11:05:25 +0100 Subject: [PATCH 1/4] Update DataDownloader expected size --- benchmark/src/jmh/java/com/knuddels/jtokkit/DataDownloader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/src/jmh/java/com/knuddels/jtokkit/DataDownloader.java b/benchmark/src/jmh/java/com/knuddels/jtokkit/DataDownloader.java index 32f7598..f26dcd2 100644 --- a/benchmark/src/jmh/java/com/knuddels/jtokkit/DataDownloader.java +++ b/benchmark/src/jmh/java/com/knuddels/jtokkit/DataDownloader.java @@ -178,7 +178,7 @@ public static void main(String[] args) throws Exception { } var totalSize = calculateTotalFileSize(rootFolder); - if (totalSize != 99_945_723) { + if (totalSize != 99_945_750) { throw new AssertionError("Total size did not match expected value, actual: " + totalSize); } } From 008fdc146a9b30da3eacb290090e2581a7384776 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Mon, 12 Feb 2024 11:06:54 +0100 Subject: [PATCH 2/4] Simplify TokenEncoders by removing irrelevant tokenCount check --- lib/src/main/java/com/knuddels/jtokkit/TokenEncoder.java | 6 +----- .../main/java/com/knuddels/jtokkit/TokenEncoderLarge.java | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/lib/src/main/java/com/knuddels/jtokkit/TokenEncoder.java b/lib/src/main/java/com/knuddels/jtokkit/TokenEncoder.java index 82afe1b..c80b41e 100644 --- a/lib/src/main/java/com/knuddels/jtokkit/TokenEncoder.java +++ b/lib/src/main/java/com/knuddels/jtokkit/TokenEncoder.java @@ -170,11 +170,7 @@ int mergeBytesAndGetTokenCount(ByteArrayWrapper piece, int length, IntArrayList ranks.set(nextIndex, DUMMY_RANK); length--; - if (length < 3) { - break; // single tokens were already filtered out, let's skip a minimum calculation - } else { - minRankIndex = getMinRankIndex(ranks); - } + minRankIndex = getMinRankIndex(ranks); } assert getMinRankIndex(ranks) < 0; return length; diff --git a/lib/src/main/java/com/knuddels/jtokkit/TokenEncoderLarge.java b/lib/src/main/java/com/knuddels/jtokkit/TokenEncoderLarge.java index ca3fb41..09c215a 100644 --- a/lib/src/main/java/com/knuddels/jtokkit/TokenEncoderLarge.java +++ b/lib/src/main/java/com/knuddels/jtokkit/TokenEncoderLarge.java @@ -28,7 +28,7 @@ static int calculateTokensLarge(TokenEncoder tokenEncoder, int maxTokenCount, bo assert rankMap.containsKey(MAX_RANK); int tokenCount = match.length(); - while (tokenCount > 2 && rankMap.size() > 1) { + while (rankMap.size() > 1) { for (Iterator it = rankMap.pollFirstEntry().getValue().values().iterator(); it.hasNext(); ) { RankNode minNode = it.next(); int minRank = minNode.rank; From 897b071b5c4a447cfc3564eca0c55febf5212fbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Mon, 12 Feb 2024 11:07:43 +0100 Subject: [PATCH 3/4] Make every 50k group possessive to avoid catastrophic backtracking --- lib/src/main/java/com/knuddels/jtokkit/Cl100kParser.java | 2 +- lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/src/main/java/com/knuddels/jtokkit/Cl100kParser.java b/lib/src/main/java/com/knuddels/jtokkit/Cl100kParser.java index c37cc62..b743fae 100644 --- a/lib/src/main/java/com/knuddels/jtokkit/Cl100kParser.java +++ b/lib/src/main/java/com/knuddels/jtokkit/Cl100kParser.java @@ -41,7 +41,7 @@ static void split(String input, Predicate fragmentConsumer) { if ((c0 == '\'') && c1 > 0) { if (isShortContraction(c1)) { - // 1) `'[sdtm]` - contractions, such as the suffixes of `he's`, `I'd`, `'tis`, `I'm` + // 1) `'[sdmt]` - contractions, such as the suffixes of `he's`, `I'd`, `'tis`, `I'm` endIndex += 2; finished = fragmentConsumer.test(addUtf8Bytes(input, startIndex, endIndex, utf8Bytes)); continue; diff --git a/lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java b/lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java index 63ac676..ab486b8 100644 --- a/lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java +++ b/lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java @@ -101,7 +101,7 @@ static Encoding p50kEdit() { * @return an {@link Encoding} instance for the cl100k_base encoding */ static Encoding cl100kBase() { - // "'(?:[sdmt]|ll|ve|re)|[^\r\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]++[\r\n]*|\\s*[\r\n]|\\s+(?!\\S)|\\s+" + // "'(?:[sdmt]|ll|ve|re)|[^\r\n\\p{L}\\p{N}]?+\\p{L}++|\\p{N}{1,3}+| ?[^\\s\\p{L}\\p{N}]++[\r\n]*+|\\s*[\r\n]|\\s+(?!\\S)|\\s++" Map mergeableRanks = loadMergeableRanks("/com/knuddels/jtokkit/cl100k_base.tiktoken"); GptBytePairEncodingParams params = new GptBytePairEncodingParams("cl100k_base", null, mergeableRanks, SPECIAL_TOKENS_CL100K_BASE); return new Cl100kGptBytePairEncoding(params); @@ -122,7 +122,7 @@ private static Encoding from50kParameters( String fileName, Map specialTokens ) { - Pattern regex = compileRegex("'(?:[sdmt]|ll|ve|re)| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", false); + Pattern regex = compileRegex("'(?:[sdmt]|ll|ve|re)| ?\\p{L}++| ?\\p{N}++| ?[^\\s\\p{L}\\p{N}]++|\\s+(?!\\S)|\\s++", false); Map mergeableRanks = loadMergeableRanks(fileName); GptBytePairEncodingParams params = new GptBytePairEncodingParams(name, regex, mergeableRanks, specialTokens); return fromParameters(params); From 27f6a861c584db13667c32b41298db99a876ff20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Tue, 13 Feb 2024 12:36:26 +0100 Subject: [PATCH 4/4] Fix whitespace catastrophic backtracking --- lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java b/lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java index ab486b8..6aa7bd0 100644 --- a/lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java +++ b/lib/src/main/java/com/knuddels/jtokkit/EncodingFactory.java @@ -101,7 +101,7 @@ static Encoding p50kEdit() { * @return an {@link Encoding} instance for the cl100k_base encoding */ static Encoding cl100kBase() { - // "'(?:[sdmt]|ll|ve|re)|[^\r\n\\p{L}\\p{N}]?+\\p{L}++|\\p{N}{1,3}+| ?[^\\s\\p{L}\\p{N}]++[\r\n]*+|\\s*[\r\n]|\\s+(?!\\S)|\\s++" + // "'(?:[sdmt]|ll|ve|re)|[^\r\n\\p{L}\\p{N}]?+\\p{L}++|\\p{N}{1,3}+| ?[^\\s\\p{L}\\p{N}]++[\r\n]*+|\\s++$|\\s*[\r\n]|\\s+(?!\\S)|\\s" Map mergeableRanks = loadMergeableRanks("/com/knuddels/jtokkit/cl100k_base.tiktoken"); GptBytePairEncodingParams params = new GptBytePairEncodingParams("cl100k_base", null, mergeableRanks, SPECIAL_TOKENS_CL100K_BASE); return new Cl100kGptBytePairEncoding(params); @@ -122,7 +122,7 @@ private static Encoding from50kParameters( String fileName, Map specialTokens ) { - Pattern regex = compileRegex("'(?:[sdmt]|ll|ve|re)| ?\\p{L}++| ?\\p{N}++| ?[^\\s\\p{L}\\p{N}]++|\\s+(?!\\S)|\\s++", false); + Pattern regex = compileRegex("'(?:[sdmt]|ll|ve|re)| ?\\p{L}++| ?\\p{N}++| ?[^\\s\\p{L}\\p{N}]++|\\s++$|\\s+(?!\\S)|\\s", false); Map mergeableRanks = loadMergeableRanks(fileName); GptBytePairEncodingParams params = new GptBytePairEncodingParams(name, regex, mergeableRanks, specialTokens); return fromParameters(params);