diff --git a/README.md b/README.md
index fc3197a3..b10f0a09 100644
--- a/README.md
+++ b/README.md
@@ -240,7 +240,7 @@ For example, computing allowed token set in the 32000-strong vocabulary of Llama
 
 - about 2.0ms for Yacc grammar of the C programming language
 - about 0.3ms for a regular expression
-- about 0.2ms for a substring contraint, from 4kB string
+- about 0.2ms for a substring constraint, from 4kB string
 
 The above numbers are for a single sequence, however each sequence is processed in separate process,
 and thus if there is more cores than sequences (which is typical), they do not change.
diff --git a/aici_abi/README.md b/aici_abi/README.md
index e394b1cd..a15dd336 100644
--- a/aici_abi/README.md
+++ b/aici_abi/README.md
@@ -1,6 +1,6 @@
 # aici_abi
 
-This crate specifies the application binary inferface (ABI) for the AICI Controllers.
+This crate specifies the application binary interface (ABI) for the AICI Controllers.
 It also provides higher-level interfaces for implementing controllers.
 
 ## Low-level interface
diff --git a/aicirt/src/lib.rs b/aicirt/src/lib.rs
index dacefc32..aa9c137f 100644
--- a/aicirt/src/lib.rs
+++ b/aicirt/src/lib.rs
@@ -24,7 +24,7 @@ pub use fxhash::FxHashSet as HashSet;
 pub enum LogMode {
     Normal,
     Test,
-    Deamon,
+    Daemon,
 }
 
 fn daemon_format(
@@ -47,7 +47,7 @@ pub fn init_log(mode: LogMode) -> Result<()> {
         LogMode::Test => {
             Logger::try_with_env_or_str("debug")?.write_mode(WriteMode::SupportCapture)
         }
-        LogMode::Deamon => Logger::try_with_env_or_str("info")?
+        LogMode::Daemon => Logger::try_with_env_or_str("info")?
             .format(daemon_format)
             .log_to_stdout(),
     };
diff --git a/declctrl/src/declctrl.rs b/declctrl/src/declctrl.rs
index 5dac5d1a..4ffd5cfb 100644
--- a/declctrl/src/declctrl.rs
+++ b/declctrl/src/declctrl.rs
@@ -774,7 +774,7 @@ impl StepState {
            let sidx = runner.bytes.len() - nbytes;
 
            for idx in sidx.saturating_sub(1)..runner.bytes.len().saturating_sub(1) {
-                if !is_boundry(runner.bytes[idx]) && is_boundry(runner.bytes[idx + 1]) {
+                if !is_boundary(runner.bytes[idx]) && is_boundary(runner.bytes[idx + 1]) {
                     self.num_words += 1;
                     break;
                 }
@@ -828,7 +828,7 @@ impl StepState {
 
         return None;
 
-        fn is_boundry(b: u8) -> bool {
+        fn is_boundary(b: u8) -> bool {
             b == b' ' || b == b'\n' || b == b'\t'
         }
     }
diff --git a/jsctrl/README.md b/jsctrl/README.md
index e8200661..70a13c84 100644
--- a/jsctrl/README.md
+++ b/jsctrl/README.md
@@ -15,7 +15,7 @@ and use the native constraints.
 
 This is quite similar to [PyCtrl](../pyctrl/README.md) but with JavaScript instead of Python.
 It is also smaller, at 1.3MiB without regex and CFG, 1.8MiB with regex, and 3.3MiB with regex and CFG.
-For comparision, pyctrl is 14MiB.
+For comparison, pyctrl is 14MiB.
 Also, the [PyCtrl samples](../pyctrl/samples/) translate 1:1 to JsCtrl.
 
 ## Usage
diff --git a/pyaici/cli.py b/pyaici/cli.py
index 31007a82..ab6d9165 100644
--- a/pyaici/cli.py
+++ b/pyaici/cli.py
@@ -121,7 +121,7 @@ def infer_args(cmd: argparse.ArgumentParser):
     cmd.add_argument(
         "--temperature",
         type=float,
-        help="temperature for sampling; deflaut 0.0 (argmax)",
+        help="temperature for sampling; default 0.0 (argmax)",
     )
 
 
@@ -306,8 +306,8 @@ def main_inner():
 
     jsinit_cmd = subparsers.add_parser(
         "jsinit",
-        help="intialize current folder for jsctrl",
-        description="Intialize a JavaScript/TypeScript folder for jsctrl.",
+        help="initialize current folder for jsctrl",
+        description="Initialize a JavaScript/TypeScript folder for jsctrl.",
     )
     jsinit_cmd.add_argument(
         "--force", "-f", action="store_true", help="overwrite existing files"
diff --git a/pyaici/server.py b/pyaici/server.py
index 57d76412..8c4854fd 100644
--- a/pyaici/server.py
+++ b/pyaici/server.py
@@ -404,7 +404,7 @@ async def wrap():
 class Label:
     def __init__(self):
         """
-        Create a new label the indictes the current position in the sequence.
+        Create a new label the indicates the current position in the sequence.
         Can be passed as `following=` argument to `FixedTokens()`.
         """
         self.ptr = len(get_tokens())
diff --git a/pyaici/server_native.py b/pyaici/server_native.py
index 74d3db7c..3a67bea8 100644
--- a/pyaici/server_native.py
+++ b/pyaici/server_native.py
@@ -66,7 +66,7 @@ def eos_token() -> int:
 class TokenSet(Sequence[bool]):
     """
     Represents a set of tokens.
-    The value is true at indicies corresponding to tokens in the set.
+    The value is true at indices corresponding to tokens in the set.
     """
 
     def __init__(self):
diff --git a/rllm-cuda/scripts/convert.py b/rllm-cuda/scripts/convert.py
index f19f2fad..a435e8c6 100644
--- a/rllm-cuda/scripts/convert.py
+++ b/rllm-cuda/scripts/convert.py
@@ -66,7 +66,7 @@ def _remove_duplicate_names(
 
         keep_name = sorted(list(complete_names))[0]
 
-        # Mecanism to preferentially select keys to keep
+        # Mechanism to preferentially select keys to keep
         # coming from the on-disk file to allow
         # loading models saved with a different choice
         # of keep_name
@@ -99,7 +99,7 @@ def get_discard_names(model_id: str, revision: Optional[str], folder: str, token
 
         class_ = getattr(transformers, architecture)
 
-        # Name for this varible depends on transformers version.
+        # Name for this variable depends on transformers version.
         discard_names = getattr(class_, "_tied_weights_keys", [])
 
     except Exception:
diff --git a/rllm-cuda/src/paged/blocks.rs b/rllm-cuda/src/paged/blocks.rs
index 03b8f288..70593512 100644
--- a/rllm-cuda/src/paged/blocks.rs
+++ b/rllm-cuda/src/paged/blocks.rs
@@ -66,7 +66,7 @@ impl BlockRef {
         }
     }
 
-    pub fn is_singlular(&self) -> bool {
+    pub fn is_singular(&self) -> bool {
         let mut alloc = self.allocator.lock().unwrap();
         let blk = &mut alloc.all_blocks[self.block_idx];
         assert!(blk.ref_count > 0);
@@ -206,7 +206,7 @@ impl BlockSpaceManager {
             let block_idx = ptr / self.block_size;
             if block_idx < block_table.len() {
                 let curr_block = &mut block_table[block_idx];
-                if !curr_block.is_singlular() {
+                if !curr_block.is_singular() {
                     let new_block = self.alloc_gpu();
                     let old_block_number = curr_block.block_idx;
                     let new_block_number = new_block.block_idx;
diff --git a/rllm-cuda/src/server/mod.rs b/rllm-cuda/src/server/mod.rs
index 6d8cbdca..14a81c1d 100644
--- a/rllm-cuda/src/server/mod.rs
+++ b/rllm-cuda/src/server/mod.rs
@@ -541,7 +541,7 @@ pub async fn server_main(mut args: RllmCliArgs) -> () {
         None => {}
     }
     aicirt::init_log(if args.daemon {
-        aicirt::LogMode::Deamon
+        aicirt::LogMode::Daemon
     } else {
         aicirt::LogMode::Normal
     })
diff --git a/tch-cuda/kernels/flash_attn/flash_api.cpp b/tch-cuda/kernels/flash_attn/flash_api.cpp
index d0dcc697..b1c09954 100644
--- a/tch-cuda/kernels/flash_attn/flash_api.cpp
+++ b/tch-cuda/kernels/flash_attn/flash_api.cpp
@@ -301,7 +301,7 @@ mha_fwd(at::Tensor &q,         // batch_size x seqlen_q x num_heads x head_size
     const int head_size_og = sizes[3];
     const int seqlen_k = k.size(1);
     const int num_heads_k = k.size(2);
-    TORCH_CHECK(batch_size > 0, "batch size must be postive");
+    TORCH_CHECK(batch_size > 0, "batch size must be positive");
     TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
     TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
 
@@ -1164,7 +1164,7 @@ mha_fwd_kvcache(at::Tensor &q,                 // batch_size x seqlen_q x num_he
     const int seqlen_k = kcache.size(1);
     const int num_heads_k = kcache.size(2);
     const int batch_size_c = kcache.size(0);
-    TORCH_CHECK(batch_size > 0, "batch size must be postive");
+    TORCH_CHECK(batch_size > 0, "batch size must be positive");
     TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
     TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
 
diff --git a/tch-cuda/kernels/flash_attn/flash_fwd_kernel.h b/tch-cuda/kernels/flash_attn/flash_fwd_kernel.h
index 67ee5d0d..a6f8c883 100644
--- a/tch-cuda/kernels/flash_attn/flash_fwd_kernel.h
+++ b/tch-cuda/kernels/flash_attn/flash_fwd_kernel.h
@@ -1201,7 +1201,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params &params) {
                               Shape<Int<kBlockM>>{}, Stride<_1>{});
     constexpr int kNLsePerThread = (kMaxSplits * kBlockM + kNThreads - 1) / kNThreads;
 
-    // Read the LSE values from gmem and store them in shared memory, then tranpose them.
+    // Read the LSE values from gmem and store them in shared memory, then transpose them.
     constexpr int kRowsPerLoadLSE = kNThreads / kBlockM;
     #pragma unroll
     for (int l = 0; l < kNLsePerThread; ++l) {
diff --git a/tch-cuda/kernels/flash_attn/kernel_traits.h b/tch-cuda/kernels/flash_attn/kernel_traits.h
index f000ff24..5de066c0 100644
--- a/tch-cuda/kernels/flash_attn/kernel_traits.h
+++ b/tch-cuda/kernels/flash_attn/kernel_traits.h
@@ -181,7 +181,7 @@ struct Flash_fwd_kernel_traits : public Base {
                         Layout<Shape<_1, _8>>{}));  // Val layout, 8 vals per load
 };
 
-// Is_V_in_regs is an option to reduce smem usage, but will increase register pressue.
+// Is_V_in_regs is an option to reduce smem usage, but will increase register pressure.
 // No_double_buffer is another option to reduce smem usage, but will slow things down.
 template