working on tokenizer automatons
mmoskal committed Jun 20, 2024
1 parent b841346 commit f14d04a
Showing 3 changed files with 122 additions and 3 deletions.
31 changes: 28 additions & 3 deletions aicirt/src/main.rs
@@ -13,10 +13,12 @@ use crate::{
};
use aici_abi::{
    bytes::limit_str, toktree::TokTrie, Branch, MidProcessArg, ProcessResultOffset, SeqId,
    TokenizerEnv,
};
use aicirt::{bintokens::find_tokenizer, futexshm::ServerChannel, shm::ShmAllocator, *};
use anyhow::{anyhow, ensure, Result};
use base64::{self, Engine as _};
use bintokens::ByteTokenizerEnv;
use clap::Parser;
use hex;
use hostimpl::GlobalInfo;
@@ -1099,15 +1101,17 @@ fn bench_hashmap() {

fn save_tokenizer(cli: &Cli) {
    let filename = cli.save_tokenizer.as_deref().unwrap();

    let tokenizer = find_tokenizer(&cli.tokenizer).unwrap();
    let tokens = tokenizer.token_bytes();
    let env = ByteTokenizerEnv::new(tokenizer);

    let tokens = env.tokenizer.token_bytes();
    log::info!(
        "TokTrie building: {:?} wl={}",
        tokenizer.tokrx_info(),
        env.tokenizer.tokrx_info(),
        tokens.len()
    );
    let trie = TokTrie::from(&tokenizer.tokrx_info(), &tokens);
    let trie = &env.tok_trie;
    trie.check_against(&tokens);

    let bytes = trie.serialize();
@@ -1119,6 +1123,27 @@ fn save_tokenizer(cli: &Cli) {

    std::fs::write(filename, &bytes).unwrap();
    println!("wrote {}, {} bytes", filename, bytes.len());

    if false {
        for (a, abytes) in tokens.iter().enumerate() {
            let mut ts = trie.alloc_token_set();
            let a = a as u32;
            for (b, bbytes) in tokens.iter().enumerate() {
                let b = b as u32;
                let mut bytes = abytes.to_vec();
                bytes.extend_from_slice(bbytes);
                let r = env.tokenize_bytes(&bytes);
                if r.len() == 2 && r[0] == a && r[1] == b {
                    ts.allow_token(b);
                }
            }

            let neg = ts.num_set() > 15000;
            let ts = if neg { ts.negated() } else { ts };
            let elts = ts.iter().collect::<Vec<_>>();
            println!("{a} ==> {neg} {elts:?}");
        }
    }
}

fn install_from_cmdline(cli: &Cli, wasm_ctx: WasmContext, shm: Rc<ShmAllocator>) {
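A note on the if false block added to save_tokenizer above: for every ordered token pair (a, b) it concatenates the two tokens' byte strings, re-tokenizes the result, and marks b as allowed after a only when the tokenizer reproduces exactly [a, b]; follow-sets with more than 15000 members are stored negated. Each line is printed as "tokid ==> negated [elts...]", presumably redirected into tmp/tokens.txt, which the new scripts/random/parse-tokenizer-automaton.js below consumes. A minimal JavaScript sketch of the same check, assuming a hypothetical tokenizeBytes(bytes) helper returning token ids and a tokens array of per-token byte Buffers (neither is part of this commit):

// Sketch only: which tokens b may directly follow token a without the
// tokenizer merging across the token boundary.
function followSet(tokens, tokenizeBytes, a) {
    const allowed = []
    for (let b = 0; b < tokens.length; b++) {
        const r = tokenizeBytes(Buffer.concat([tokens[a], tokens[b]]))
        // b survives only if re-tokenizing bytes(a) + bytes(b) yields [a, b] again
        if (r.length == 2 && r[0] == a && r[1] == b) allowed.push(b)
    }
    return allowed
}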
82 changes: 82 additions & 0 deletions scripts/random/parse-tokenizer-automaton.js
@@ -0,0 +1,82 @@
const fs = require("fs")

const nevers = [0, 1, 2, 16, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 32000]

const MIN_DELTA = 0
const MAX_DELTA = MIN_DELTA + 2

function diff(a, b) {
    let i = 0
    let j = 0
    let delta = 0
    while (i < a.length || j < b.length) {
        if (i < a.length && j < b.length) {
            if (a[i] == b[j]) {
                i++;
                j++;
            } else if (a[i] < b[j]) {
                delta++;
                i++;
            } else {
                delta++;
                j++;
            }
        } else if (i < a.length) {
            delta++;
            i++;
        } else {
            delta++;
            j++;
        }

        if (delta > MAX_DELTA) {
            return delta;
        }
    }
    return delta;
}

const buckets = []
let no_bucket_size = 0

fs.readFileSync("tmp/tokens.txt", "utf8").split("\n").forEach((line, i) => {
    const m = /^(\d+) ==> (true|false) (.*)/.exec(line);
    if (!m) return
    const tokid = +m[1];
    let elts = Array.from(JSON.parse(m[3]));
    const neg = m[2] == "true";
    const isAllowed = (e) => {
        if (neg) return !elts.includes(e);
        return elts.includes(e);
    }

    const nev = nevers.find(e => isAllowed(e));
    if (nev) {
        console.log(tokid, "N", nev);
    }

    const empty = elts.length == 0 && !neg;
    if (empty) {
        //console.log(tokid, "E");
    } else {
        if (!neg) {
            console.log(tokid, "A", elts.length);
        } else {
            let existing = false
            elts = elts.filter(e => !nevers.includes(e));
            for (const b of buckets) {
                if (diff(elts, b) <= MIN_DELTA) {
                    existing = true;
                    break;
                }
            }
            if (!existing) {
                buckets.push(elts);
            }
            no_bucket_size += elts.length;
            console.log(tokid, "F", elts.length, buckets.length);
        }
    }
})

console.log(buckets.reduce((a, b) => a + b.length, 0), no_bucket_size)
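
The diff() helper treats its two arguments as sorted token-id lists and counts how many ids appear in only one of them (the size of the symmetric difference), returning early once the count exceeds MAX_DELTA. With MIN_DELTA = 0, a negated follow-set is therefore only matched to an existing bucket when it is identical to that bucket (after the never-allowed ids in nevers are filtered out), and the final console.log compares the summed size of the deduplicated buckets with no_bucket_size, the summed size of all filtered negated sets. A few worked calls showing the expected values, assuming the function as written above:

console.log(diff([1, 2, 3], [1, 3, 4]))  // 2: id 2 only in a, id 4 only in b
console.log(diff([1, 2, 3], [1, 2, 3]))  // 0: identical lists, so same bucket
console.log(diff([1, 2], [3, 4, 5, 6]))  // 3: returns early once MAX_DELTA = 2 is exceeded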
12 changes: 12 additions & 0 deletions scripts/tokenizer-stats.js
@@ -133,6 +133,18 @@ function stats(fn) {
    const ones = byteCounts.map((c, i) => c == 1 ? i : undefined).filter(x => x !== undefined)
    const zeros = byteCounts.map((c, i) => c == 0 ? i : undefined).filter(x => x !== undefined)
    console.log("Byte counts:", { ones, zeros })

    const merged = {}
    for (const merge of tokenizer.model.merges) {
        const words = merge.split(' ');
        if (words.length != 2) {
            console.log("Bad merge: ", merge)
        }
        merged[words[0]] = true
        merged[words[1]] = true
    }
    console.log("Num merges:", tokenizer.model.merges.length)
    console.log("Num merged tokens:", Object.keys(merged).length)
}

function permute(arr) {
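The new block in stats() assumes the tokenizer JSON layout this script already reads, where model.merges is a list of space-separated BPE merge pairs; it counts how many distinct token strings participate in at least one merge. A tiny self-contained example with a made-up merges list (not taken from any real tokenizer):

const merges = ["a b", "ab c", "b c"]  // hypothetical BPE merges
const merged = {}
for (const m of merges) {
    const [left, right] = m.split(' ')
    merged[left] = true
    merged[right] = true
}
console.log(Object.keys(merged).length)  // 4 distinct merged tokens: a, b, ab, c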
