diff --git a/aicirt/src/main.rs b/aicirt/src/main.rs
index a2ce80b4..5a7ef23f 100644
--- a/aicirt/src/main.rs
+++ b/aicirt/src/main.rs
@@ -13,10 +13,12 @@ use crate::{
 };
 use aici_abi::{
     bytes::limit_str, toktree::TokTrie, Branch, MidProcessArg, ProcessResultOffset, SeqId,
+    TokenizerEnv,
 };
 use aicirt::{bintokens::find_tokenizer, futexshm::ServerChannel, shm::ShmAllocator, *};
 use anyhow::{anyhow, ensure, Result};
 use base64::{self, Engine as _};
+use bintokens::ByteTokenizerEnv;
 use clap::Parser;
 use hex;
 use hostimpl::GlobalInfo;
@@ -1099,15 +1101,17 @@ fn bench_hashmap() {
 fn save_tokenizer(cli: &Cli) {
     let filename = cli.save_tokenizer.as_deref().unwrap();
+
     let tokenizer = find_tokenizer(&cli.tokenizer).unwrap();
-    let tokens = tokenizer.token_bytes();
+    let env = ByteTokenizerEnv::new(tokenizer);
+    let tokens = env.tokenizer.token_bytes();
 
     log::info!(
         "TokTrie building: {:?} wl={}",
-        tokenizer.tokrx_info(),
+        env.tokenizer.tokrx_info(),
         tokens.len()
     );
 
-    let trie = TokTrie::from(&tokenizer.tokrx_info(), &tokens);
+    let trie = &env.tok_trie;
     trie.check_against(&tokens);
 
     let bytes = trie.serialize();
@@ -1119,6 +1123,27 @@ fn save_tokenizer(cli: &Cli) {
 
     std::fs::write(filename, &bytes).unwrap();
     println!("wrote {}, {} bytes", filename, bytes.len());
+
+    if false {
+        for (a, abytes) in tokens.iter().enumerate() {
+            let mut ts = trie.alloc_token_set();
+            let a = a as u32;
+            for (b, bbytes) in tokens.iter().enumerate() {
+                let b = b as u32;
+                let mut bytes = abytes.to_vec();
+                bytes.extend_from_slice(bbytes);
+                let r = env.tokenize_bytes(&bytes);
+                if r.len() == 2 && r[0] == a && r[1] == b {
+                    ts.allow_token(b);
+                }
+            }
+
+            let neg = ts.num_set() > 15000;
+            let ts = if neg { ts.negated() } else { ts };
+            let elts = ts.iter().collect::<Vec<_>>();
+            println!("{a} ==> {neg} {elts:?}");
+        }
+    }
 }
 
 fn install_from_cmdline(cli: &Cli, wasm_ctx: WasmContext, shm: Rc<ShmAllocator>) {
diff --git a/scripts/random/parse-tokenizer-automaton.js b/scripts/random/parse-tokenizer-automaton.js
new file mode 100644
index 00000000..78d805ac
--- /dev/null
+++ b/scripts/random/parse-tokenizer-automaton.js
@@ -0,0 +1,82 @@
+const fs = require("fs")
+
+const nevers = [0, 1, 2, 16, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 32000]
+
+const MIN_DELTA = 0
+const MAX_DELTA = MIN_DELTA + 2
+
+function diff(a, b) {
+    let i = 0
+    let j = 0
+    let delta = 0
+    while (i < a.length || j < b.length) {
+        if (i < a.length && j < b.length) {
+            if (a[i] == b[j]) {
+                i++;
+                j++;
+            } else if (a[i] < b[j]) {
+                delta++;
+                i++;
+            } else {
+                delta++;
+                j++;
+            }
+        } else if (i < a.length) {
+            delta++;
+            i++;
+        } else {
+            delta++;
+            j++;
+        }
+
+        if (delta > MAX_DELTA) {
+            return delta;
+        }
+    }
+    return delta;
+}
+
+const buckets = []
+let no_bucket_size = 0
+
+fs.readFileSync("tmp/tokens.txt", "utf8").split("\n").forEach((line, i) => {
+    const m = /^(\d+) ==> (true|false) (.*)/.exec(line);
+    if (!m) return
+    const tokid = +m[1];
+    let elts = Array.from(JSON.parse(m[3]));
+    const neg = m[2] == "true";
+    const isAllowed = (e) => {
+        if (neg) return !elts.includes(e);
+        return elts.includes(e);
+    }
+
+    const nev = nevers.find(e => isAllowed(e));
+    if (nev) {
+        console.log(tokid, "N", nev);
+    }
+
+    const empty = elts.length == 0 && !neg;
+    if (empty) {
+        //console.log(tokid, "E");
+    } else {
+        if (!neg) {
+            console.log(tokid, "A", elts.length);
+        } else {
+            let existing = false
+            elts = elts.filter(e => !nevers.includes(e));
+            for (const b of buckets) {
+                if (diff(elts, b) <= MIN_DELTA) {
+                    existing = true;
+                    break;
+                }
+            }
+            if (!existing) {
+                buckets.push(elts);
+            }
+            no_bucket_size += elts.length;
+            console.log(tokid, "F", elts.length, buckets.length);
+        }
+    }
+})
+
+console.log(buckets.reduce((a, b) => a + b.length, 0), no_bucket_size)
diff --git a/scripts/tokenizer-stats.js b/scripts/tokenizer-stats.js
index 5475b2d2..e65f8d5b 100755
--- a/scripts/tokenizer-stats.js
+++ b/scripts/tokenizer-stats.js
@@ -133,6 +133,18 @@ function stats(fn) {
     const ones = byteCounts.map((c, i) => c == 1 ? i : undefined).filter(x => x !== undefined)
     const zeros = byteCounts.map((c, i) => c == 0 ? i : undefined).filter(x => x !== undefined)
     console.log("Byte counts:", { ones, zeros })
+
+    const merged = {}
+    for (const merge of tokenizer.model.merges) {
+        const words = merge.split(' ');
+        if (words.length != 2) {
+            console.log("Bad merge: ", merge)
+        }
+        merged[words[0]] = true
+        merged[words[1]] = true
+    }
+    console.log("Num merges:", tokenizer.model.merges.length)
+    console.log("Num merged tokens:", Object.keys(merged).length)
 }
 
 function permute(arr) {