From cad56462a02bf675f1380a914c52e2b0e12f145d Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Fri, 15 Dec 2023 09:33:30 +0100
Subject: [PATCH] llama: update llama.cpp to latest version (#244)

* llama: update llama.cpp to latest version

This commit updates llama.cpp to the latest/later version.

The motivation for this is that the current version of llama.cpp is
a little outdated and there have been changes to the llama.cpp API and
also the model format. Currently it is not possible to use the new GGUF
format and many of the available models are in this new format which
can make it challenging to use this crate at the moment.

The following changes have been made:
* update llama.cpp to latest version using
  git submodule update --remote --merge llama.cpp

* Manually copied the generated bindings.rs file from the target
  directory to the src directory. Hope this was the correct thing to do.

* Updated the llm-chain-llama crate to use llama_decode instead of
  llama_eval, which has now been deprecated.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* build: increase timeout to 30 mins for ci jobs

This is an attempt to fix builds from returning:

```
Error: The operation was canceled.
```

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* build: add fail-fast: false to build strategy

This is an attempt to prevent the currently failing windows build from
causing the other builds to be cancelled (at least that is what I think
is happening).

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* hnsw: update to hnsw_rs 0.2

This commit is an attempt to update the hnsw dependency to version 0.2
and to fix the Windows build of hnsw_rs version 0.1.19, which is
currently failing to compile on Windows:

```console
error[E0308]: mismatched types
   --> C:\Users\runneradmin\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hnsw_rs-0.1.19\src\libext.rs:439:39
    |
439 |     let c_dist = DistCFFI::<f32>::new(c_func);
    |                  -------------------- ^^^^^^ expected `u32`, found `u64`
    |                  |
    |                  arguments to this function are incorrect
    |
    = note: expected fn pointer `extern "C" fn(_, _, u32) -> _`
               found fn pointer `extern "C" fn(_, _, u64) -> _`
note: associated function defined here
   --> C:\Users\runneradmin\.cargo\registry\src\index.crates.io-6f17d22bba15001f\hnsw_rs-0.1.19\src\dist.rs:990:12
    |
990 |     pub fn new(f:DistCFnPtr<T>) -> Self {
    |            ^^^ ---------------
```

I was able to reproduce this issue locally by cross compiling (which
produces the above error). Cross compiling with version 0.2 works, so
I've attempted to upgrade to that version.

This is very much a suggestion as I'm not familiar with the hnsw code
but perhaps it will be useful to someone else and save some time
investigating the issue.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* build: write bindings to src directory

This commit changes the build.rs script to write the generated
bindings to the src directory to avoid manual copying of the
bindings.rs file.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* doc: add instructions for updating llama.cpp

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* src: update llama.cpp submodule

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* squash! llama: update llama.cpp to latest version

This commit reverts the change to the StopSequence option in
llm-chain-llama/src/options.rs.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* squash! llama: update llama.cpp to latest version

This commit removes the `From<llama_batch> for LlamaBatch` impl as it
is no longer needed.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* squash! llama: update llama.cpp to latest version

This commit creates a new LlamaBatch for each newly sampled token
instead of reusing the same one.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* squash! llama: update llama.cpp to latest version

This commit extracts the logic for checking if the prompt is a question
into a separate conditional check. I've tried to clarify the comment of
this check as well so it is hopefully easier to understand now.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

---------

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 .github/workflows/cicd.yaml                   |    2 +
 Cargo.lock                                    |  164 +-
 crates/llm-chain-hnsw/Cargo.toml              |    2 +-
 crates/llm-chain-hnsw/examples/dump_load.rs   |   17 +-
 crates/llm-chain-hnsw/src/lib.rs              |   54 +-
 crates/llm-chain-llama-sys/README.md          |    7 +
 crates/llm-chain-llama-sys/build.rs           |    7 +-
 crates/llm-chain-llama-sys/llama.cpp          |    2 +-
 crates/llm-chain-llama-sys/src/bindings.rs    | 3998 ++++++++++++++---
 crates/llm-chain-llama/examples/alpaca.rs     |    2 +-
 crates/llm-chain-llama/examples/few_shot.rs   |   10 +-
 .../examples/map_reduce_llama.rs              |   27 +-
 .../llm-chain-llama/examples/simple_llama.rs  |    2 +-
 crates/llm-chain-llama/examples/stream.rs     |    5 +-
 crates/llm-chain-llama/src/batch.rs           |  118 +
 crates/llm-chain-llama/src/context.rs         |  180 +-
 crates/llm-chain-llama/src/executor.rs        |  158 +-
 crates/llm-chain-llama/src/lib.rs             |    3 +
 crates/llm-chain-llama/src/model.rs           |   69 +
 crates/llm-chain-llama/src/options.rs         |   55 +-
 crates/llm-chain-llama/src/tokenizer.rs       |   43 +-
 crates/llm-chain/src/options.rs               |   16 +
 crates/llm-chain/src/traits.rs                |    2 +
 23 files changed, 4080 insertions(+), 863 deletions(-)
 create mode 100644 crates/llm-chain-llama/src/batch.rs
 create mode 100644 crates/llm-chain-llama/src/model.rs
diff --git a/.github/workflows/cicd.yaml b/.github/workflows/cicd.yaml
index 9c8db2b0..a0207b0d 100644
--- a/.github/workflows/cicd.yaml
+++ b/.github/workflows/cicd.yaml
@@ -36,10 +36,12 @@ jobs:
 
   build_and_test:
     strategy:
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-13, windows-latest]
         rust-version: [stable]
     runs-on: ${{ matrix.os }}
+    timeout-minutes: 30
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
diff --git a/Cargo.lock b/Cargo.lock
index a117b184..dfe6f9eb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -25,9 +25,15 @@ checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217"
 
 [[package]]
 name = "ahash"
-version = "0.4.8"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0453232ace82dee0dd0b4c87a59bd90f7b53b314f3e0f61fe2ee7c8a16482289"
+checksum = "91429305e9f0a25f6205c5b8e0d2db09e0708a7a6df0f42212bb56c32c8ac97a"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "version_check",
+ "zerocopy",
+]
 
 [[package]]
 name = "aho-corasick"
@@ -38,6 +44,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "allocator-api2"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
+
 [[package]]
 name = "android-tzdata"
 version = "0.1.1"
@@ -662,6 +674,12 @@ version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6"
 
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
 [[package]]
 name = "bytes"
 version = "1.5.0"
@@ -748,6 +766,16 @@ dependencies = [
  "libloading",
 ]
 
+[[package]]
+name = "combine"
+version = "4.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
+dependencies = [
+ "bytes",
+ "memchr",
+]
+
 [[package]]
 name = "core-foundation"
 version = "0.9.3"
@@ -803,7 +831,7 @@ dependencies = [
  "autocfg",
  "cfg-if",
  "crossbeam-utils",
- "memoffset",
+ "memoffset 0.9.0",
  "scopeguard",
 ]
 
@@ -1008,6 +1036,18 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "enum-as-inner"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ffccbb6966c05b32ef8fbac435df276c4ae4d3dc55a8cd0eb9745e6c12f546a"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.39",
+]
+
 [[package]]
 name = "env_logger"
 version = "0.10.1"
@@ -1336,15 +1376,6 @@ dependencies = [
  "autocfg",
 ]
 
-[[package]]
-name = "hashbrown"
-version = "0.9.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04"
-dependencies = [
- "ahash 0.4.8",
-]
-
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -1356,6 +1387,10 @@ name = "hashbrown"
 version = "0.14.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156"
+dependencies = [
+ "ahash 0.8.6",
+ "allocator-api2",
+]
 
 [[package]]
 name = "heck"
@@ -1386,16 +1421,18 @@ dependencies = [
 
 [[package]]
 name = "hnsw_rs"
-version = "0.1.19"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0308727459701f2fa18286e50662c37044e130e955bfd42b6ff30260116b2a5"
+checksum = "baf40f00346c339c8181f485ef409e49412649cde8e318cc6804849841ad85f1"
 dependencies = [
+ "anyhow",
  "bincode",
  "cpu-time",
  "env_logger",
- "hashbrown 0.9.1",
+ "hashbrown 0.14.2",
  "lazy_static",
  "log",
+ "mmap-rs",
  "num-traits",
  "num_cpus",
  "parking_lot",
@@ -1972,6 +2009,15 @@ version = "0.4.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
 
+[[package]]
+name = "mach2"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d0d1830bcd151a6fc4aea1369af235b36c1528fe976b8ff678683c9995eade8"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "markdown"
 version = "1.0.0-alpha.14"
@@ -2002,6 +2048,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "memoffset"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "memoffset"
 version = "0.9.0"
@@ -2072,6 +2127,23 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "mmap-rs"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e1af4ac2b44e6faa5d82a400349ccf8444d68559eca4c6f976befc4eee963da"
+dependencies = [
+ "bitflags 1.3.2",
+ "combine",
+ "libc",
+ "mach2",
+ "nix",
+ "sysctl",
+ "thiserror",
+ "widestring",
+ "windows",
+]
+
 [[package]]
 name = "mockall"
 version = "0.11.4"
@@ -2123,6 +2195,19 @@ dependencies = [
  "tempfile",
 ]
 
+[[package]]
+name = "nix"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b"
+dependencies = [
+ "bitflags 1.3.2",
+ "cfg-if",
+ "libc",
+ "memoffset 0.7.1",
+ "pin-utils",
+]
+
 [[package]]
 name = "nom"
 version = "7.1.3"
@@ -3270,6 +3355,20 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
 
+[[package]]
+name = "sysctl"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec7dddc5f0fee506baf8b9fdb989e242f17e4b11c61dfbb0635b705217199eea"
+dependencies = [
+ "bitflags 2.4.1",
+ "byteorder",
+ "enum-as-inner",
+ "libc",
+ "thiserror",
+ "walkdir",
+]
+
 [[package]]
 name = "system-configuration"
 version = "0.5.1"
@@ -4003,6 +4102,12 @@ dependencies = [
  "rustix",
 ]
 
+[[package]]
+name = "widestring"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "653f141f39ec16bba3c5abe400a0c60da7468261cc2cbf36805022876bc721a8"
+
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -4034,6 +4139,15 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets",
+]
+
 [[package]]
 name = "windows-core"
 version = "0.51.1"
@@ -4125,6 +4239,26 @@ version = "0.13.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4"
 
+[[package]]
+name = "zerocopy"
+version = "0.7.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e97e415490559a91254a2979b4829267a57d2fcd741a98eee8b722fb57289aa0"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.7.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd7e48ccf166952882ca8bd778a43502c64f33bf94c12ebe2a7f08e5a0f6689f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.39",
+]
+
 [[package]]
 name = "zeroize"
 version = "1.6.0"
diff --git a/crates/llm-chain-hnsw/Cargo.toml b/crates/llm-chain-hnsw/Cargo.toml
index 703160f7..5f5f9874 100644
--- a/crates/llm-chain-hnsw/Cargo.toml
+++ b/crates/llm-chain-hnsw/Cargo.toml
@@ -14,7 +14,7 @@ repository = "https://github.com/sobelio/llm-chain/"
 
 [dependencies]
 async-trait.workspace = true
-hnsw_rs = "0.1.19"
+hnsw_rs = "0.2"
 llm-chain = { path = "../llm-chain", version = "0.13.0", default-features = false }
 serde.workspace = true
 serde_json.workspace = true
diff --git a/crates/llm-chain-hnsw/examples/dump_load.rs b/crates/llm-chain-hnsw/examples/dump_load.rs
index 58204272..4922de1d 100644
--- a/crates/llm-chain-hnsw/examples/dump_load.rs
+++ b/crates/llm-chain-hnsw/examples/dump_load.rs
@@ -1,3 +1,5 @@
+use hnsw_rs::{hnswio::*, prelude::*};
+use std::path::PathBuf;
 use std::sync::Arc;
 
 use llm_chain::{
@@ -16,7 +18,7 @@ async fn main() {
     let hnsw_index_fn = "hnsw_index".to_string();
     let mut embeddings = llm_chain_openai::embeddings::Embeddings::default();
     let document_store = Arc::new(Mutex::new(InMemoryDocumentStore::<EmptyMetadata>::new()));
-    let mut hnsw_vs = HnswVectorStore::new(
+    let hnsw_vs = HnswVectorStore::new(
         HnswArgs::default(),
         Arc::new(embeddings),
         document_store.clone(),
@@ -56,12 +58,13 @@ async fn main() {
     // Load
     println!("Loading hnsw index from file");
     embeddings = llm_chain_openai::embeddings::Embeddings::default();
-    hnsw_vs = HnswVectorStore::load_from_file(
-        hnsw_index_fn,
-        Arc::new(embeddings),
-        document_store.clone(),
-    )
-    .unwrap();
+
+    let mut hnswio = HnswIo::new(PathBuf::from("."), hnsw_index_fn);
+    let hnsw_loaded = hnswio.load_hnsw::<f32, DistCosine>().unwrap();
+    let hnsw_vs =
+        HnswVectorStore::load_from_file(hnsw_loaded, Arc::new(embeddings), document_store.clone())
+            .unwrap();
+
     println!("Loaded!");
 
     let response = hnsw_vs
diff --git a/crates/llm-chain-hnsw/src/lib.rs b/crates/llm-chain-hnsw/src/lib.rs
index 44661011..87f9d54d 100644
--- a/crates/llm-chain-hnsw/src/lib.rs
+++ b/crates/llm-chain-hnsw/src/lib.rs
@@ -1,10 +1,7 @@
-use std::{
-    collections::HashMap, fs::OpenOptions, io::BufReader, marker::PhantomData, path::PathBuf,
-    sync::Arc,
-};
+use std::{collections::HashMap, marker::PhantomData, sync::Arc};
 
 use async_trait::async_trait;
-use hnsw_rs::{hnsw::Hnsw, hnswio::*, prelude::*};
+use hnsw_rs::{hnsw::Hnsw, prelude::*};
 use llm_chain::{
     document_stores::document_store::*,
     schema::Document,
@@ -32,19 +29,19 @@ impl Default for HnswArgs {
     }
 }
 
-pub struct HnswVectorStore<E, D, M>
+pub struct HnswVectorStore<'a, E, D, M>
 where
     E: Embeddings,
     D: DocumentStore<usize, M> + Send + Sync,
     M: Serialize + DeserializeOwned + Send + Sync,
 {
-    hnsw: Arc<Hnsw<f32, DistCosine>>,
+    hnsw: Arc<Hnsw<'a, f32, DistCosine>>,
     document_store: Arc<Mutex<D>>,
     embeddings: Arc<E>,
     _marker: PhantomData<M>,
 }
 
-impl<E, D, M> HnswVectorStore<E, D, M>
+impl<'a, E, D, M> HnswVectorStore<'a, E, D, M>
 where
     E: Embeddings,
     D: DocumentStore<usize, M> + Send + Sync,
@@ -69,47 +66,20 @@ where
     pub fn dump_to_file(
         &self,
         filename: String,
-    ) -> Result<i32, HnswVectorStoreError<E::Error, D::Error>> {
+    ) -> Result<String, HnswVectorStoreError<E::Error, D::Error>> {
         self.hnsw
             .file_dump(&filename)
-            .map_err(HnswVectorStoreError::FileDumpError)
+            .map_err(|e| HnswVectorStoreError::FileDumpError(e.to_string()))
     }
 
     pub fn load_from_file(
-        filename: String,
+        hnsw: Hnsw<'a, f32, DistCosine>,
         embeddings: Arc<E>,
         document_store: Arc<Mutex<D>>,
-    ) -> Result<Self, HnswVectorStoreError<E::Error, D::Error>> {
-        let graph_fn = format!("{}.hnsw.graph", &filename);
-        let graph_path = PathBuf::from(graph_fn);
-        let graph_file_res = OpenOptions::new().read(true).open(&graph_path);
-        if graph_file_res.is_err() {
-            return Err(HnswVectorStoreError::FileLoadError(format!(
-                "could not open file {:?}",
-                graph_path.as_os_str()
-            )));
-        }
-        let graph_file = graph_file_res.unwrap();
-        let data_fn = format!("{}.hnsw.data", &filename);
-        let data_path = PathBuf::from(data_fn);
-        let data_file_res = OpenOptions::new().read(true).open(&data_path);
-        if data_file_res.is_err() {
-            return Err(HnswVectorStoreError::FileLoadError(format!(
-                "could not open file {:?}",
-                data_path.as_os_str()
-            )));
-        }
-        let data_file = data_file_res.unwrap();
-
-        let mut graph_in = BufReader::new(graph_file);
-        let mut data_in = BufReader::new(data_file);
-
-        let hnsw_description = load_description(&mut graph_in).unwrap();
-        let hnsw_loaded: Hnsw<f32, DistCosine> =
-            load_hnsw(&mut graph_in, &hnsw_description, &mut data_in).unwrap();
-
+    ) -> Result<Self, HnswVectorStoreError<E::Error, D::Error>>
+where {
         Ok(HnswVectorStore {
-            hnsw: Arc::new(hnsw_loaded),
+            hnsw: Arc::new(hnsw),
             document_store,
             embeddings,
             _marker: Default::default(),
@@ -143,7 +113,7 @@ where
 }
 
 #[async_trait]
-impl<E, D, M> VectorStore<E, M> for HnswVectorStore<E, D, M>
+impl<'a, E, D, M> VectorStore<E, M> for HnswVectorStore<'a, E, D, M>
 where
     E: Embeddings + Send + Sync,
     D: DocumentStore<usize, M> + Send + Sync,
diff --git a/crates/llm-chain-llama-sys/README.md b/crates/llm-chain-llama-sys/README.md
index 66f56b64..fd5c31d4 100644
--- a/crates/llm-chain-llama-sys/README.md
+++ b/crates/llm-chain-llama-sys/README.md
@@ -12,3 +12,10 @@ use llama_sys::\*;
 ```
 
 Note that llama-sys provides a lower-level interface than llm-chain-llama, and may be more difficult to use. However, if you need fine-grained control over llama.cpp, llama-sys is the way to go.
+
+## Updating llama.cpp submodule
+To update the llama.cpp submodule, run the following command:
+
+```console
+$ git submodule update --remote --merge llama.cpp
+```
diff --git a/crates/llm-chain-llama-sys/build.rs b/crates/llm-chain-llama-sys/build.rs
index e85f682d..270200eb 100644
--- a/crates/llm-chain-llama-sys/build.rs
+++ b/crates/llm-chain-llama-sys/build.rs
@@ -49,6 +49,9 @@ fn main() {
                 let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
                 b.write_to_file(out_path.join("bindings.rs"))
                     .expect("Couldn't write bindings!");
+                let out_path = PathBuf::from("src");
+                b.write_to_file(out_path.join("bindings.rs"))
+                    .expect("Couldn't write binding to src directorys!");
             }
             Err(e) => {
                 println!("cargo:warning=Unable to generate bindings: {}", e);
@@ -85,7 +88,9 @@ fn main() {
         .arg("-DLLAMA_ALL_WARNINGS=OFF")
         .arg("-DLLAMA_ALL_WARNINGS_3RD_PARTY=OFF")
         .arg("-DLLAMA_BUILD_TESTS=OFF")
-        .arg("-DLLAMA_BUILD_EXAMPLES=OFF");
+        .arg("-DLLAMA_BUILD_EXAMPLES=OFF")
+        .arg("-DLLAMA_NO_METAL=ON")
+        .arg("-DLLAMA_METAL=OFF");
     // .arg("-DLLAMA_STATIC=ON")
     if cuda_enabled {
         // If CUDA feature is enabled, build with cuBlAS to enable GPU acceleration
diff --git a/crates/llm-chain-llama-sys/llama.cpp b/crates/llm-chain-llama-sys/llama.cpp
index 173d0e64..e4b76bbe 160000
--- a/crates/llm-chain-llama-sys/llama.cpp
+++ b/crates/llm-chain-llama-sys/llama.cpp
@@ -1 +1 @@
-Subproject commit 173d0e6419e8f8f3c1f4f13201b777f4c60629f3
+Subproject commit e4b76bbe316ee50fb17d9ac29e654c0edf830eba
diff --git a/crates/llm-chain-llama-sys/src/bindings.rs b/crates/llm-chain-llama-sys/src/bindings.rs
index eae6618f..2ce37af8 100644
--- a/crates/llm-chain-llama-sys/src/bindings.rs
+++ b/crates/llm-chain-llama-sys/src/bindings.rs
@@ -52,7 +52,7 @@ pub const __STDC_IEC_60559_COMPLEX__: u32 = 201404;
 pub const __STDC_ISO_10646__: u32 = 201706;
 pub const __GNU_LIBRARY__: u32 = 6;
 pub const __GLIBC__: u32 = 2;
-pub const __GLIBC_MINOR__: u32 = 37;
+pub const __GLIBC_MINOR__: u32 = 36;
 pub const _SYS_CDEFS_H: u32 = 1;
 pub const __glibc_c99_flexarr_available: u32 = 1;
 pub const __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI: u32 = 0;
@@ -152,30 +152,76 @@ pub const GGML_FILE_VERSION: u32 = 1;
 pub const GGML_QNT_VERSION: u32 = 2;
 pub const GGML_QNT_VERSION_FACTOR: u32 = 1000;
 pub const GGML_MAX_DIMS: u32 = 4;
-pub const GGML_MAX_NODES: u32 = 4096;
-pub const GGML_MAX_PARAMS: u32 = 256;
+pub const GGML_MAX_PARAMS: u32 = 1024;
 pub const GGML_MAX_CONTEXTS: u32 = 64;
 pub const GGML_MAX_SRC: u32 = 6;
-pub const GGML_MAX_NAME: u32 = 48;
-pub const GGML_MAX_OP_PARAMS: u32 = 32;
+pub const GGML_MAX_NAME: u32 = 64;
+pub const GGML_MAX_OP_PARAMS: u32 = 64;
 pub const GGML_DEFAULT_N_THREADS: u32 = 4;
+pub const GGML_DEFAULT_GRAPH_SIZE: u32 = 2048;
+pub const GGML_MEM_ALIGN: u32 = 16;
 pub const GGML_EXIT_SUCCESS: u32 = 0;
 pub const GGML_EXIT_ABORTED: u32 = 1;
-pub const GGML_GRAPH_HASHTABLE_SIZE: u32 = 8273;
-pub const GGML_CUDA_MAX_DEVICES: u32 = 16;
-pub const LLAMA_MAX_DEVICES: u32 = 16;
-pub const LLAMA_FILE_MAGIC_GGJT: u32 = 1734830708;
-pub const LLAMA_FILE_MAGIC_GGLA: u32 = 1734831201;
-pub const LLAMA_FILE_MAGIC_GGMF: u32 = 1734831462;
-pub const LLAMA_FILE_MAGIC_GGML: u32 = 1734831468;
+pub const GGUF_MAGIC: &[u8; 5] = b"GGUF\0";
+pub const GGUF_VERSION: u32 = 3;
+pub const GGUF_DEFAULT_ALIGNMENT: u32 = 32;
+pub const GGML_N_TASKS_MAX: i32 = -1;
+pub const LLAMA_MAX_DEVICES: u32 = 1;
+pub const _STDIO_H: u32 = 1;
+pub const __GNUC_VA_LIST: u32 = 1;
+pub const _____fpos_t_defined: u32 = 1;
+pub const ____mbstate_t_defined: u32 = 1;
+pub const _____fpos64_t_defined: u32 = 1;
+pub const ____FILE_defined: u32 = 1;
+pub const __FILE_defined: u32 = 1;
+pub const __struct_FILE_defined: u32 = 1;
+pub const _IO_EOF_SEEN: u32 = 16;
+pub const _IO_ERR_SEEN: u32 = 32;
+pub const _IO_USER_LOCK: u32 = 32768;
+pub const __cookie_io_functions_t_defined: u32 = 1;
+pub const _IOFBF: u32 = 0;
+pub const _IOLBF: u32 = 1;
+pub const _IONBF: u32 = 2;
+pub const BUFSIZ: u32 = 8192;
+pub const EOF: i32 = -1;
+pub const SEEK_SET: u32 = 0;
+pub const SEEK_CUR: u32 = 1;
+pub const SEEK_END: u32 = 2;
+pub const SEEK_DATA: u32 = 3;
+pub const SEEK_HOLE: u32 = 4;
+pub const P_tmpdir: &[u8; 5] = b"/tmp\0";
+pub const _BITS_STDIO_LIM_H: u32 = 1;
+pub const L_tmpnam: u32 = 20;
+pub const TMP_MAX: u32 = 238328;
+pub const FILENAME_MAX: u32 = 4096;
+pub const L_ctermid: u32 = 9;
+pub const L_cuserid: u32 = 9;
+pub const FOPEN_MAX: u32 = 16;
+pub const _PRINTF_NAN_LEN_MAX: u32 = 4;
+pub const RENAME_NOREPLACE: u32 = 1;
+pub const RENAME_EXCHANGE: u32 = 2;
+pub const RENAME_WHITEOUT: u32 = 4;
+pub const __HAVE_FLOAT128: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT128: u32 = 0;
+pub const __HAVE_FLOAT64X: u32 = 1;
+pub const __HAVE_FLOAT64X_LONG_DOUBLE: u32 = 1;
+pub const __HAVE_FLOAT16: u32 = 0;
+pub const __HAVE_FLOAT32: u32 = 1;
+pub const __HAVE_FLOAT64: u32 = 1;
+pub const __HAVE_FLOAT32X: u32 = 1;
+pub const __HAVE_FLOAT128X: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT16: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT32: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT64: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT32X: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT64X: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT128X: u32 = 0;
+pub const __HAVE_FLOATN_NOT_TYPEDEF: u32 = 0;
+pub const LLAMA_DEFAULT_SEED: u32 = 4294967295;
+pub const LLAMA_MAX_RNG_STATE: u32 = 65536;
 pub const LLAMA_FILE_MAGIC_GGSN: u32 = 1734833006;
-pub const LLAMA_FILE_VERSION: u32 = 3;
-pub const LLAMA_FILE_MAGIC: u32 = 1734830708;
-pub const LLAMA_FILE_MAGIC_UNVERSIONED: u32 = 1734831468;
 pub const LLAMA_SESSION_MAGIC: u32 = 1734833006;
-pub const LLAMA_SESSION_VERSION: u32 = 1;
-pub const LLAMA_DEFAULT_SEED: u32 = 4294967295;
-pub const LLAMA_DEFAULT_RMS_EPS: f64 = 0.000005;
+pub const LLAMA_SESSION_VERSION: u32 = 2;
 pub type __u_char = ::std::os::raw::c_uchar;
 pub type __u_short = ::std::os::raw::c_ushort;
 pub type __u_int = ::std::os::raw::c_uint;
@@ -366,10 +412,10 @@ pub const ggml_type_GGML_TYPE_I16: ggml_type = 17;
 pub const ggml_type_GGML_TYPE_I32: ggml_type = 18;
 pub const ggml_type_GGML_TYPE_COUNT: ggml_type = 19;
 pub type ggml_type = ::std::os::raw::c_uint;
-pub const ggml_backend_GGML_BACKEND_CPU: ggml_backend = 0;
-pub const ggml_backend_GGML_BACKEND_GPU: ggml_backend = 10;
-pub const ggml_backend_GGML_BACKEND_GPU_SPLIT: ggml_backend = 20;
-pub type ggml_backend = ::std::os::raw::c_uint;
+pub const ggml_backend_type_GGML_BACKEND_CPU: ggml_backend_type = 0;
+pub const ggml_backend_type_GGML_BACKEND_GPU: ggml_backend_type = 10;
+pub const ggml_backend_type_GGML_BACKEND_GPU_SPLIT: ggml_backend_type = 20;
+pub type ggml_backend_type = ::std::os::raw::c_uint;
 pub const ggml_ftype_GGML_FTYPE_UNKNOWN: ggml_ftype = -1;
 pub const ggml_ftype_GGML_FTYPE_ALL_F32: ggml_ftype = 0;
 pub const ggml_ftype_GGML_FTYPE_MOSTLY_F16: ggml_ftype = 1;
@@ -402,49 +448,58 @@ pub const ggml_op_GGML_OP_MEAN: ggml_op = 13;
 pub const ggml_op_GGML_OP_ARGMAX: ggml_op = 14;
 pub const ggml_op_GGML_OP_REPEAT: ggml_op = 15;
 pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 16;
-pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 17;
-pub const ggml_op_GGML_OP_NORM: ggml_op = 18;
-pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 19;
-pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 20;
-pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 21;
-pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 22;
-pub const ggml_op_GGML_OP_SCALE: ggml_op = 23;
-pub const ggml_op_GGML_OP_SET: ggml_op = 24;
-pub const ggml_op_GGML_OP_CPY: ggml_op = 25;
-pub const ggml_op_GGML_OP_CONT: ggml_op = 26;
-pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 27;
-pub const ggml_op_GGML_OP_VIEW: ggml_op = 28;
-pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 29;
-pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 30;
-pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 31;
-pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 32;
-pub const ggml_op_GGML_OP_DIAG: ggml_op = 33;
-pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 34;
-pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 35;
-pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 36;
-pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 37;
-pub const ggml_op_GGML_OP_ROPE: ggml_op = 38;
-pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 39;
-pub const ggml_op_GGML_OP_ALIBI: ggml_op = 40;
-pub const ggml_op_GGML_OP_CLAMP: ggml_op = 41;
-pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 42;
-pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 43;
-pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 44;
-pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 45;
-pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 46;
-pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 47;
-pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 48;
-pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 49;
-pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 50;
-pub const ggml_op_GGML_OP_UNARY: ggml_op = 51;
-pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 52;
-pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 53;
-pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 54;
-pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 55;
-pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 56;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 57;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 58;
-pub const ggml_op_GGML_OP_COUNT: ggml_op = 59;
+pub const ggml_op_GGML_OP_CONCAT: ggml_op = 17;
+pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 18;
+pub const ggml_op_GGML_OP_NORM: ggml_op = 19;
+pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 20;
+pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 21;
+pub const ggml_op_GGML_OP_GROUP_NORM: ggml_op = 22;
+pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 23;
+pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 24;
+pub const ggml_op_GGML_OP_SCALE: ggml_op = 25;
+pub const ggml_op_GGML_OP_SET: ggml_op = 26;
+pub const ggml_op_GGML_OP_CPY: ggml_op = 27;
+pub const ggml_op_GGML_OP_CONT: ggml_op = 28;
+pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 29;
+pub const ggml_op_GGML_OP_VIEW: ggml_op = 30;
+pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 31;
+pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 32;
+pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 33;
+pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 34;
+pub const ggml_op_GGML_OP_DIAG: ggml_op = 35;
+pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 36;
+pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 37;
+pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 38;
+pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 39;
+pub const ggml_op_GGML_OP_ROPE: ggml_op = 40;
+pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 41;
+pub const ggml_op_GGML_OP_ALIBI: ggml_op = 42;
+pub const ggml_op_GGML_OP_CLAMP: ggml_op = 43;
+pub const ggml_op_GGML_OP_CONV_TRANSPOSE_1D: ggml_op = 44;
+pub const ggml_op_GGML_OP_IM2COL: ggml_op = 45;
+pub const ggml_op_GGML_OP_CONV_TRANSPOSE_2D: ggml_op = 46;
+pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 47;
+pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 48;
+pub const ggml_op_GGML_OP_UPSCALE: ggml_op = 49;
+pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 50;
+pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 51;
+pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 52;
+pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 53;
+pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 54;
+pub const ggml_op_GGML_OP_GET_REL_POS: ggml_op = 55;
+pub const ggml_op_GGML_OP_ADD_REL_POS: ggml_op = 56;
+pub const ggml_op_GGML_OP_UNARY: ggml_op = 57;
+pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 58;
+pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 59;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1_F32: ggml_op = 60;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2_F32: ggml_op = 61;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3_F32: ggml_op = 62;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 63;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 64;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 65;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 66;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 67;
+pub const ggml_op_GGML_OP_COUNT: ggml_op = 68;
 pub type ggml_op = ::std::os::raw::c_uint;
 pub const ggml_unary_op_GGML_UNARY_OP_ABS: ggml_unary_op = 0;
 pub const ggml_unary_op_GGML_UNARY_OP_SGN: ggml_unary_op = 1;
@@ -456,11 +511,16 @@ pub const ggml_unary_op_GGML_UNARY_OP_RELU: ggml_unary_op = 6;
 pub const ggml_unary_op_GGML_UNARY_OP_GELU: ggml_unary_op = 7;
 pub const ggml_unary_op_GGML_UNARY_OP_GELU_QUICK: ggml_unary_op = 8;
 pub const ggml_unary_op_GGML_UNARY_OP_SILU: ggml_unary_op = 9;
+pub const ggml_unary_op_GGML_UNARY_OP_LEAKY: ggml_unary_op = 10;
 pub type ggml_unary_op = ::std::os::raw::c_uint;
 pub const ggml_object_type_GGML_OBJECT_TENSOR: ggml_object_type = 0;
 pub const ggml_object_type_GGML_OBJECT_GRAPH: ggml_object_type = 1;
 pub const ggml_object_type_GGML_OBJECT_WORK_BUFFER: ggml_object_type = 2;
 pub type ggml_object_type = ::std::os::raw::c_uint;
+pub const ggml_log_level_GGML_LOG_LEVEL_ERROR: ggml_log_level = 2;
+pub const ggml_log_level_GGML_LOG_LEVEL_WARN: ggml_log_level = 3;
+pub const ggml_log_level_GGML_LOG_LEVEL_INFO: ggml_log_level = 4;
+pub type ggml_log_level = ::std::os::raw::c_uint;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_object {
@@ -540,22 +600,25 @@ pub const GGML_OBJECT_SIZE: usize = 32;
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_tensor {
     pub type_: ggml_type,
-    pub backend: ggml_backend,
+    pub backend: ggml_backend_type,
+    pub buffer: *mut ggml_backend_buffer,
     pub n_dims: ::std::os::raw::c_int,
     pub ne: [i64; 4usize],
     pub nb: [usize; 4usize],
     pub op: ggml_op,
-    pub op_params: [i32; 8usize],
+    pub op_params: [i32; 16usize],
     pub is_param: bool,
     pub grad: *mut ggml_tensor,
     pub src: [*mut ggml_tensor; 6usize],
     pub perf_runs: ::std::os::raw::c_int,
     pub perf_cycles: i64,
     pub perf_time_us: i64,
+    pub view_src: *mut ggml_tensor,
+    pub view_offs: usize,
     pub data: *mut ::std::os::raw::c_void,
-    pub name: [::std::os::raw::c_char; 48usize],
+    pub name: [::std::os::raw::c_char; 64usize],
     pub extra: *mut ::std::os::raw::c_void,
-    pub padding: [::std::os::raw::c_char; 4usize],
+    pub padding: [::std::os::raw::c_char; 12usize],
 }
 #[test]
 fn bindgen_test_layout_ggml_tensor() {
@@ -563,7 +626,7 @@ fn bindgen_test_layout_ggml_tensor() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_tensor>(),
-        272usize,
+        352usize,
         concat!("Size of: ", stringify!(ggml_tensor))
     );
     assert_eq!(
@@ -592,8 +655,18 @@ fn bindgen_test_layout_ggml_tensor() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_dims) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).buffer) as usize - ptr as usize },
         8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(buffer)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_dims) as usize - ptr as usize },
+        16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -603,7 +676,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).ne) as usize - ptr as usize },
-        16usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -613,7 +686,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).nb) as usize - ptr as usize },
-        48usize,
+        56usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -623,7 +696,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).op) as usize - ptr as usize },
-        80usize,
+        88usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -633,7 +706,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).op_params) as usize - ptr as usize },
-        84usize,
+        92usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -643,7 +716,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).is_param) as usize - ptr as usize },
-        116usize,
+        156usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -653,7 +726,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).grad) as usize - ptr as usize },
-        120usize,
+        160usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -663,7 +736,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).src) as usize - ptr as usize },
-        128usize,
+        168usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -673,7 +746,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
-        176usize,
+        216usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -683,7 +756,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize },
-        184usize,
+        224usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -693,7 +766,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize },
-        192usize,
+        232usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -701,9 +774,29 @@ fn bindgen_test_layout_ggml_tensor() {
             stringify!(perf_time_us)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).view_src) as usize - ptr as usize },
+        240usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(view_src)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).view_offs) as usize - ptr as usize },
+        248usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(view_offs)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
-        200usize,
+        256usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -713,7 +806,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).name) as usize - ptr as usize },
-        208usize,
+        264usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -723,7 +816,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).extra) as usize - ptr as usize },
-        256usize,
+        328usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -733,7 +826,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize },
-        264usize,
+        336usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -742,14 +835,13 @@ fn bindgen_test_layout_ggml_tensor() {
         )
     );
 }
-pub const GGML_TENSOR_SIZE: usize = 272;
+pub const GGML_TENSOR_SIZE: usize = 352;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_cplan {
     pub work_size: usize,
     pub work_data: *mut u8,
     pub n_threads: ::std::os::raw::c_int,
-    pub n_tasks: [::std::os::raw::c_int; 4096usize],
     pub abort_callback:
         ::std::option::Option<unsafe extern "C" fn(data: *mut ::std::os::raw::c_void) -> bool>,
     pub abort_callback_data: *mut ::std::os::raw::c_void,
@@ -760,7 +852,7 @@ fn bindgen_test_layout_ggml_cplan() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_cplan>(),
-        16424usize,
+        40usize,
         concat!("Size of: ", stringify!(ggml_cplan))
     );
     assert_eq!(
@@ -799,45 +891,82 @@ fn bindgen_test_layout_ggml_cplan() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_tasks) as usize - ptr as usize },
-        20usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).abort_callback) as usize - ptr as usize },
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cplan),
             "::",
-            stringify!(n_tasks)
+            stringify!(abort_callback)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).abort_callback) as usize - ptr as usize },
-        16408usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).abort_callback_data) as usize - ptr as usize },
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cplan),
             "::",
-            stringify!(abort_callback)
+            stringify!(abort_callback_data)
         )
     );
+}
+pub const ggml_cgraph_eval_order_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT: ggml_cgraph_eval_order = 0;
+pub const ggml_cgraph_eval_order_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT: ggml_cgraph_eval_order = 1;
+pub const ggml_cgraph_eval_order_GGML_CGRAPH_EVAL_ORDER_COUNT: ggml_cgraph_eval_order = 2;
+pub type ggml_cgraph_eval_order = ::std::os::raw::c_uint;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct ggml_hash_set {
+    pub size: usize,
+    pub keys: *mut *mut ggml_tensor,
+}
+#[test]
+fn bindgen_test_layout_ggml_hash_set() {
+    const UNINIT: ::std::mem::MaybeUninit<ggml_hash_set> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).abort_callback_data) as usize - ptr as usize },
-        16416usize,
+        ::std::mem::size_of::<ggml_hash_set>(),
+        16usize,
+        concat!("Size of: ", stringify!(ggml_hash_set))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<ggml_hash_set>(),
+        8usize,
+        concat!("Alignment of ", stringify!(ggml_hash_set))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
+        0usize,
         concat!(
             "Offset of field: ",
-            stringify!(ggml_cplan),
+            stringify!(ggml_hash_set),
             "::",
-            stringify!(abort_callback_data)
+            stringify!(size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).keys) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_hash_set),
+            "::",
+            stringify!(keys)
         )
     );
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_cgraph {
+    pub size: ::std::os::raw::c_int,
     pub n_nodes: ::std::os::raw::c_int,
     pub n_leafs: ::std::os::raw::c_int,
-    pub nodes: [*mut ggml_tensor; 4096usize],
-    pub grads: [*mut ggml_tensor; 4096usize],
-    pub leafs: [*mut ggml_tensor; 4096usize],
-    pub visited_hash_table: [*mut ::std::os::raw::c_void; 8273usize],
+    pub nodes: *mut *mut ggml_tensor,
+    pub grads: *mut *mut ggml_tensor,
+    pub leafs: *mut *mut ggml_tensor,
+    pub visited_hash_table: ggml_hash_set,
+    pub order: ggml_cgraph_eval_order,
     pub perf_runs: ::std::os::raw::c_int,
     pub perf_cycles: i64,
     pub perf_time_us: i64,
@@ -848,7 +977,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_cgraph>(),
-        164520usize,
+        80usize,
         concat!("Size of: ", stringify!(ggml_cgraph))
     );
     assert_eq!(
@@ -857,8 +986,18 @@ fn bindgen_test_layout_ggml_cgraph() {
         concat!("Alignment of ", stringify!(ggml_cgraph))
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
         0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_cgraph),
+            "::",
+            stringify!(size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize },
+        4usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -868,7 +1007,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).n_leafs) as usize - ptr as usize },
-        4usize,
+        8usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -878,7 +1017,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).nodes) as usize - ptr as usize },
-        8usize,
+        16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -888,7 +1027,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).grads) as usize - ptr as usize },
-        32776usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -898,7 +1037,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).leafs) as usize - ptr as usize },
-        65544usize,
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -908,7 +1047,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).visited_hash_table) as usize - ptr as usize },
-        98312usize,
+        40usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -916,9 +1055,19 @@ fn bindgen_test_layout_ggml_cgraph() {
             stringify!(visited_hash_table)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).order) as usize - ptr as usize },
+        56usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_cgraph),
+            "::",
+            stringify!(order)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
-        164496usize,
+        60usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -928,7 +1077,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize },
-        164504usize,
+        64usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -938,7 +1087,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize },
-        164512usize,
+        72usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -947,7 +1096,6 @@ fn bindgen_test_layout_ggml_cgraph() {
         )
     );
 }
-pub const GGML_GRAPH_SIZE: usize = 164520;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_scratch {
@@ -1145,6 +1293,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_cycles_per_ms() -> i64;
 }
+extern "C" {
+    pub fn ggml_print_backtrace();
+}
 extern "C" {
     pub fn ggml_numa_init();
 }
@@ -1166,6 +1317,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_nbytes(tensor: *const ggml_tensor) -> usize;
 }
+extern "C" {
+    pub fn ggml_nbytes_pad(tensor: *const ggml_tensor) -> usize;
+}
 extern "C" {
     pub fn ggml_nbytes_split(
         tensor: *const ggml_tensor,
@@ -1208,6 +1362,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_is_permuted(tensor: *const ggml_tensor) -> bool;
 }
+extern "C" {
+    pub fn ggml_are_same_shape(t0: *const ggml_tensor, t1: *const ggml_tensor) -> bool;
+}
 extern "C" {
     pub fn ggml_tensor_overhead() -> usize;
 }
@@ -1290,7 +1447,16 @@ extern "C" {
     pub fn ggml_dup_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *mut ggml_tensor) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_get_first_tensor(ctx: *mut ggml_context) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_get_next_tensor(
+        ctx: *mut ggml_context,
+        tensor: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_get_tensor(
@@ -1307,18 +1473,66 @@ extern "C" {
 extern "C" {
     pub fn ggml_set_f32(tensor: *mut ggml_tensor, value: f32) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_unravel_index(
+        tensor: *const ggml_tensor,
+        i: i64,
+        i0: *mut i64,
+        i1: *mut i64,
+        i2: *mut i64,
+        i3: *mut i64,
+    );
+}
 extern "C" {
     pub fn ggml_get_i32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int) -> i32;
 }
 extern "C" {
     pub fn ggml_set_i32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int, value: i32);
 }
+extern "C" {
+    pub fn ggml_get_i32_nd(
+        tensor: *const ggml_tensor,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+        i2: ::std::os::raw::c_int,
+        i3: ::std::os::raw::c_int,
+    ) -> i32;
+}
+extern "C" {
+    pub fn ggml_set_i32_nd(
+        tensor: *const ggml_tensor,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+        i2: ::std::os::raw::c_int,
+        i3: ::std::os::raw::c_int,
+        value: i32,
+    );
+}
 extern "C" {
     pub fn ggml_get_f32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int) -> f32;
 }
 extern "C" {
     pub fn ggml_set_f32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int, value: f32);
 }
+extern "C" {
+    pub fn ggml_get_f32_nd(
+        tensor: *const ggml_tensor,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+        i2: ::std::os::raw::c_int,
+        i3: ::std::os::raw::c_int,
+    ) -> f32;
+}
+extern "C" {
+    pub fn ggml_set_f32_nd(
+        tensor: *const ggml_tensor,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+        i2: ::std::os::raw::c_int,
+        i3: ::std::os::raw::c_int,
+        value: f32,
+    );
+}
 extern "C" {
     pub fn ggml_get_data(tensor: *const ggml_tensor) -> *mut ::std::os::raw::c_void;
 }
@@ -1364,6 +1578,14 @@ extern "C" {
         b: *mut ggml_tensor,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_add_cast(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        type_: ggml_type,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_add1(
         ctx: *mut ggml_context,
@@ -1486,6 +1708,13 @@ extern "C" {
         b: *mut ggml_tensor,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_concat(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_abs(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
@@ -1525,6 +1754,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_relu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_leaky(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_relu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
@@ -1555,10 +1787,14 @@ extern "C" {
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_norm_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        eps: f32,
+    ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32)
@@ -1571,11 +1807,26 @@ extern "C" {
         eps: f32,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_group_norm(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        n_groups: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_group_norm_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        n_groups: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_rms_norm_back(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
         b: *mut ggml_tensor,
+        eps: f32,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1682,6 +1933,36 @@ extern "C" {
 extern "C" {
     pub fn ggml_cont_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_cont_1d(ctx: *mut ggml_context, a: *mut ggml_tensor, ne0: i64) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_cont_2d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        ne0: i64,
+        ne1: i64,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_cont_3d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        ne0: i64,
+        ne1: i64,
+        ne2: i64,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_cont_4d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        ne0: i64,
+        ne1: i64,
+        ne2: i64,
+        ne3: i64,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_reshape(
         ctx: *mut ggml_context,
@@ -1832,6 +2113,14 @@ extern "C" {
 extern "C" {
     pub fn ggml_soft_max_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_soft_max_ext(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        mask: *mut ggml_tensor,
+        scale: f32,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_soft_max_back(
         ctx: *mut ggml_context,
@@ -1850,7 +2139,7 @@ extern "C" {
     pub fn ggml_rope(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
@@ -1860,7 +2149,7 @@ extern "C" {
     pub fn ggml_rope_inplace(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
@@ -1870,34 +2159,73 @@ extern "C" {
     pub fn ggml_rope_custom(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
+        n_orig_ctx: ::std::os::raw::c_int,
         freq_base: f32,
         freq_scale: f32,
+        ext_factor: f32,
+        attn_factor: f32,
+        beta_fast: f32,
+        beta_slow: f32,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_rope_custom_inplace(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
+        n_orig_ctx: ::std::os::raw::c_int,
         freq_base: f32,
         freq_scale: f32,
+        ext_factor: f32,
+        attn_factor: f32,
+        beta_fast: f32,
+        beta_slow: f32,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_rope_yarn_corr_dims(
+        n_dims: ::std::os::raw::c_int,
+        n_orig_ctx: ::std::os::raw::c_int,
+        freq_base: f32,
+        beta_fast: f32,
+        beta_slow: f32,
+        dims: *mut f32,
+    );
+}
+extern "C" {
+    pub fn ggml_rope_xpos_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        n_dims: ::std::os::raw::c_int,
+        base: f32,
+        down: bool,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_rope_back(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
+        n_orig_ctx: ::std::os::raw::c_int,
+        freq_base: f32,
+        freq_scale: f32,
+        ext_factor: f32,
+        attn_factor: f32,
+        beta_fast: f32,
+        beta_slow: f32,
+        xpos_base: f32,
+        xpos_down: bool,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1918,26 +2246,27 @@ extern "C" {
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_conv_1d(
+    pub fn ggml_im2col(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
         b: *mut ggml_tensor,
         s0: ::std::os::raw::c_int,
+        s1: ::std::os::raw::c_int,
         p0: ::std::os::raw::c_int,
+        p1: ::std::os::raw::c_int,
         d0: ::std::os::raw::c_int,
+        d1: ::std::os::raw::c_int,
+        is_2D: bool,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_conv_2d(
+    pub fn ggml_conv_1d(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
         b: *mut ggml_tensor,
         s0: ::std::os::raw::c_int,
-        s1: ::std::os::raw::c_int,
         p0: ::std::os::raw::c_int,
-        p1: ::std::os::raw::c_int,
         d0: ::std::os::raw::c_int,
-        d1: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1949,31 +2278,83 @@ extern "C" {
         d: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
-pub const ggml_op_pool_GGML_OP_POOL_MAX: ggml_op_pool = 0;
-pub const ggml_op_pool_GGML_OP_POOL_AVG: ggml_op_pool = 1;
-pub const ggml_op_pool_GGML_OP_POOL_COUNT: ggml_op_pool = 2;
-pub type ggml_op_pool = ::std::os::raw::c_uint;
 extern "C" {
-    pub fn ggml_pool_1d(
+    pub fn ggml_conv_transpose_1d(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        op: ggml_op_pool,
-        k0: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         s0: ::std::os::raw::c_int,
         p0: ::std::os::raw::c_int,
+        d0: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_pool_2d(
+    pub fn ggml_conv_2d(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        op: ggml_op_pool,
-        k0: ::std::os::raw::c_int,
-        k1: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         s0: ::std::os::raw::c_int,
         s1: ::std::os::raw::c_int,
         p0: ::std::os::raw::c_int,
         p1: ::std::os::raw::c_int,
+        d0: ::std::os::raw::c_int,
+        d1: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_conv_2d_sk_p0(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_conv_2d_s1_ph(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_conv_transpose_2d_p0(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        stride: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+pub const ggml_op_pool_GGML_OP_POOL_MAX: ggml_op_pool = 0;
+pub const ggml_op_pool_GGML_OP_POOL_AVG: ggml_op_pool = 1;
+pub const ggml_op_pool_GGML_OP_POOL_COUNT: ggml_op_pool = 2;
+pub type ggml_op_pool = ::std::os::raw::c_uint;
+extern "C" {
+    pub fn ggml_pool_1d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_op_pool,
+        k0: ::std::os::raw::c_int,
+        s0: ::std::os::raw::c_int,
+        p0: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_pool_2d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_op_pool,
+        k0: ::std::os::raw::c_int,
+        k1: ::std::os::raw::c_int,
+        s0: ::std::os::raw::c_int,
+        s1: ::std::os::raw::c_int,
+        p0: f32,
+        p1: f32,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_upscale(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        scale_factor: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -2021,6 +2402,44 @@ extern "C" {
         w: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_unary(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_unary_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_get_rel_pos(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        qh: ::std::os::raw::c_int,
+        kh: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_add_rel_pos(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        pw: *mut ggml_tensor,
+        ph: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_add_rel_pos_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        pw: *mut ggml_tensor,
+        ph: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
 pub type ggml_unary_op_f32_t = ::std::option::Option<
     unsafe extern "C" fn(arg1: ::std::os::raw::c_int, arg2: *mut f32, arg3: *const f32),
 >;
@@ -2049,20 +2468,6 @@ pub type ggml_custom3_op_f32_t = ::std::option::Option<
         arg4: *const ggml_tensor,
     ),
 >;
-extern "C" {
-    pub fn ggml_unary(
-        ctx: *mut ggml_context,
-        a: *mut ggml_tensor,
-        op: ggml_unary_op,
-    ) -> *mut ggml_tensor;
-}
-extern "C" {
-    pub fn ggml_unary_inplace(
-        ctx: *mut ggml_context,
-        a: *mut ggml_tensor,
-        op: ggml_unary_op,
-    ) -> *mut ggml_tensor;
-}
 extern "C" {
     pub fn ggml_map_unary_f32(
         ctx: *mut ggml_context,
@@ -2141,6 +2546,96 @@ extern "C" {
         fun: ggml_custom3_op_f32_t,
     ) -> *mut ggml_tensor;
 }
+pub type ggml_custom1_op_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        dst: *mut ggml_tensor,
+        a: *const ggml_tensor,
+        ith: ::std::os::raw::c_int,
+        nth: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ),
+>;
+pub type ggml_custom2_op_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        dst: *mut ggml_tensor,
+        a: *const ggml_tensor,
+        b: *const ggml_tensor,
+        ith: ::std::os::raw::c_int,
+        nth: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ),
+>;
+pub type ggml_custom3_op_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        dst: *mut ggml_tensor,
+        a: *const ggml_tensor,
+        b: *const ggml_tensor,
+        c: *const ggml_tensor,
+        ith: ::std::os::raw::c_int,
+        nth: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ),
+>;
+extern "C" {
+    pub fn ggml_map_custom1(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        fun: ggml_custom1_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom1_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        fun: ggml_custom1_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom2(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        fun: ggml_custom2_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom2_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        fun: ggml_custom2_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom3(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        c: *mut ggml_tensor,
+        fun: ggml_custom3_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom3_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        c: *mut ggml_tensor,
+        fun: ggml_custom3_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_cross_entropy_loss(
         ctx: *mut ggml_context,
@@ -2163,27 +2658,49 @@ extern "C" {
     pub fn ggml_build_forward_expand(cgraph: *mut ggml_cgraph, tensor: *mut ggml_tensor);
 }
 extern "C" {
-    pub fn ggml_build_forward(tensor: *mut ggml_tensor) -> ggml_cgraph;
-}
-extern "C" {
-    pub fn ggml_build_backward(
+    pub fn ggml_build_backward_expand(
         ctx: *mut ggml_context,
         gf: *mut ggml_cgraph,
+        gb: *mut ggml_cgraph,
         keep: bool,
-    ) -> ggml_cgraph;
+    );
 }
 extern "C" {
     pub fn ggml_new_graph(ctx: *mut ggml_context) -> *mut ggml_cgraph;
 }
 extern "C" {
-    pub fn ggml_build_forward_ctx(
+    pub fn ggml_new_graph_custom(
         ctx: *mut ggml_context,
-        tensor: *mut ggml_tensor,
+        size: usize,
+        grads: bool,
     ) -> *mut ggml_cgraph;
 }
+extern "C" {
+    pub fn ggml_graph_dup(ctx: *mut ggml_context, cgraph: *mut ggml_cgraph) -> *mut ggml_cgraph;
+}
+extern "C" {
+    pub fn ggml_graph_view(
+        ctx: *mut ggml_context,
+        cgraph: *mut ggml_cgraph,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+    ) -> *mut ggml_cgraph;
+}
+extern "C" {
+    pub fn ggml_graph_cpy(src: *mut ggml_cgraph, dst: *mut ggml_cgraph);
+}
+extern "C" {
+    pub fn ggml_graph_reset(cgraph: *mut ggml_cgraph);
+}
+extern "C" {
+    pub fn ggml_graph_clear(cgraph: *mut ggml_cgraph);
+}
 extern "C" {
     pub fn ggml_graph_overhead() -> usize;
 }
+extern "C" {
+    pub fn ggml_graph_overhead_custom(size: usize, grads: bool) -> usize;
+}
 extern "C" {
     pub fn ggml_graph_plan(
         cgraph: *mut ggml_cgraph,
@@ -2196,9 +2713,6 @@ extern "C" {
         cplan: *mut ggml_cplan,
     ) -> ::std::os::raw::c_int;
 }
-extern "C" {
-    pub fn ggml_graph_reset(cgraph: *mut ggml_cgraph);
-}
 extern "C" {
     pub fn ggml_graph_compute_with_ctx(
         ctx: *mut ggml_context,
@@ -2220,7 +2734,7 @@ extern "C" {
         fname: *const ::std::os::raw::c_char,
         ctx_data: *mut *mut ggml_context,
         ctx_eval: *mut *mut ggml_context,
-    ) -> ggml_cgraph;
+    ) -> *mut ggml_cgraph;
 }
 extern "C" {
     pub fn ggml_graph_print(cgraph: *const ggml_cgraph);
@@ -2232,6 +2746,16 @@ extern "C" {
         filename: *const ::std::os::raw::c_char,
     );
 }
+extern "C" {
+    pub fn ggml_build_backward_gradient_checkpointing(
+        ctx: *mut ggml_context,
+        gf: *mut ggml_cgraph,
+        gb: *mut ggml_cgraph,
+        gb_tmp: *mut ggml_cgraph,
+        checkpoints: *mut *mut ggml_tensor,
+        n_checkpoints: ::std::os::raw::c_int,
+    );
+}
 pub const ggml_opt_type_GGML_OPT_ADAM: ggml_opt_type = 0;
 pub const ggml_opt_type_GGML_OPT_LBFGS: ggml_opt_type = 1;
 pub type ggml_opt_type = ::std::os::raw::c_uint;
@@ -2245,22 +2769,40 @@ pub const ggml_opt_result_GGML_OPT_DID_NOT_CONVERGE: ggml_opt_result = 1;
 pub const ggml_opt_result_GGML_OPT_NO_CONTEXT: ggml_opt_result = 2;
 pub const ggml_opt_result_GGML_OPT_INVALID_WOLFE: ggml_opt_result = 3;
 pub const ggml_opt_result_GGML_OPT_FAIL: ggml_opt_result = 4;
+pub const ggml_opt_result_GGML_OPT_CANCEL: ggml_opt_result = 5;
 pub const ggml_opt_result_GGML_LINESEARCH_FAIL: ggml_opt_result = -128;
 pub const ggml_opt_result_GGML_LINESEARCH_MINIMUM_STEP: ggml_opt_result = -127;
 pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_STEP: ggml_opt_result = -126;
 pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_ITERATIONS: ggml_opt_result = -125;
 pub const ggml_opt_result_GGML_LINESEARCH_INVALID_PARAMETERS: ggml_opt_result = -124;
 pub type ggml_opt_result = ::std::os::raw::c_int;
+pub type ggml_opt_callback = ::std::option::Option<
+    unsafe extern "C" fn(
+        data: *mut ::std::os::raw::c_void,
+        accum_step: ::std::os::raw::c_int,
+        sched: *mut f32,
+        cancel: *mut bool,
+    ),
+>;
+pub type ggml_log_callback = ::std::option::Option<
+    unsafe extern "C" fn(
+        level: ggml_log_level,
+        text: *const ::std::os::raw::c_char,
+        user_data: *mut ::std::os::raw::c_void,
+    ),
+>;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_opt_params {
     pub type_: ggml_opt_type,
+    pub graph_size: usize,
     pub n_threads: ::std::os::raw::c_int,
     pub past: ::std::os::raw::c_int,
     pub delta: f32,
     pub max_no_improvement: ::std::os::raw::c_int,
     pub print_forward_graph: bool,
     pub print_backward_graph: bool,
+    pub n_gradient_accumulation: ::std::os::raw::c_int,
     pub adam: ggml_opt_params__bindgen_ty_1,
     pub lbfgs: ggml_opt_params__bindgen_ty_2,
 }
@@ -2270,12 +2812,14 @@ pub struct ggml_opt_params__bindgen_ty_1 {
     pub n_iter: ::std::os::raw::c_int,
     pub sched: f32,
     pub decay: f32,
+    pub decay_min_ndim: ::std::os::raw::c_int,
     pub alpha: f32,
     pub beta1: f32,
     pub beta2: f32,
     pub eps: f32,
     pub eps_f: f32,
     pub eps_g: f32,
+    pub gclip: f32,
 }
 #[test]
 fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
@@ -2284,7 +2828,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_params__bindgen_ty_1>(),
-        36usize,
+        44usize,
         concat!("Size of: ", stringify!(ggml_opt_params__bindgen_ty_1))
     );
     assert_eq!(
@@ -2323,8 +2867,18 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).decay_min_ndim) as usize - ptr as usize },
         12usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params__bindgen_ty_1),
+            "::",
+            stringify!(decay_min_ndim)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize },
+        16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2334,7 +2888,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).beta1) as usize - ptr as usize },
-        16usize,
+        20usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2344,7 +2898,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).beta2) as usize - ptr as usize },
-        20usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2354,7 +2908,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).eps) as usize - ptr as usize },
-        24usize,
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2364,7 +2918,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).eps_f) as usize - ptr as usize },
-        28usize,
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2374,7 +2928,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).eps_g) as usize - ptr as usize },
-        32usize,
+        36usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2382,6 +2936,16 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
             stringify!(eps_g)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).gclip) as usize - ptr as usize },
+        40usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params__bindgen_ty_1),
+            "::",
+            stringify!(gclip)
+        )
+    );
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
@@ -2508,12 +3072,12 @@ fn bindgen_test_layout_ggml_opt_params() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_params>(),
-        96usize,
+        120usize,
         concat!("Size of: ", stringify!(ggml_opt_params))
     );
     assert_eq!(
         ::std::mem::align_of::<ggml_opt_params>(),
-        4usize,
+        8usize,
         concat!("Alignment of ", stringify!(ggml_opt_params))
     );
     assert_eq!(
@@ -2526,9 +3090,19 @@ fn bindgen_test_layout_ggml_opt_params() {
             stringify!(type_)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).graph_size) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params),
+            "::",
+            stringify!(graph_size)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as usize - ptr as usize },
-        4usize,
+        16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2538,7 +3112,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).past) as usize - ptr as usize },
-        8usize,
+        20usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2548,7 +3122,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).delta) as usize - ptr as usize },
-        12usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2558,7 +3132,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).max_no_improvement) as usize - ptr as usize },
-        16usize,
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2568,7 +3142,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).print_forward_graph) as usize - ptr as usize },
-        20usize,
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2578,7 +3152,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).print_backward_graph) as usize - ptr as usize },
-        21usize,
+        33usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2586,9 +3160,19 @@ fn bindgen_test_layout_ggml_opt_params() {
             stringify!(print_backward_graph)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_gradient_accumulation) as usize - ptr as usize },
+        36usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params),
+            "::",
+            stringify!(n_gradient_accumulation)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).adam) as usize - ptr as usize },
-        24usize,
+        40usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2598,7 +3182,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).lbfgs) as usize - ptr as usize },
-        60usize,
+        84usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2615,19 +3199,17 @@ pub struct ggml_opt_context {
     pub iter: ::std::os::raw::c_int,
     pub nx: i64,
     pub just_initialized: bool,
+    pub loss_before: f32,
+    pub loss_after: f32,
     pub adam: ggml_opt_context__bindgen_ty_1,
     pub lbfgs: ggml_opt_context__bindgen_ty_2,
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_opt_context__bindgen_ty_1 {
-    pub x: *mut ggml_tensor,
-    pub g1: *mut ggml_tensor,
-    pub g2: *mut ggml_tensor,
+    pub g: *mut ggml_tensor,
     pub m: *mut ggml_tensor,
     pub v: *mut ggml_tensor,
-    pub mh: *mut ggml_tensor,
-    pub vh: *mut ggml_tensor,
     pub pf: *mut ggml_tensor,
     pub fx_best: f32,
     pub fx_prev: f32,
@@ -2640,7 +3222,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_context__bindgen_ty_1>(),
-        80usize,
+        48usize,
         concat!("Size of: ", stringify!(ggml_opt_context__bindgen_ty_1))
     );
     assert_eq!(
@@ -2649,78 +3231,38 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
         concat!("Alignment of ", stringify!(ggml_opt_context__bindgen_ty_1))
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).x) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).g) as usize - ptr as usize },
         0usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(x)
+            stringify!(g)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).g1) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize },
         8usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(g1)
+            stringify!(m)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).g2) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize },
         16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(g2)
+            stringify!(v)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).pf) as usize - ptr as usize },
         24usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(m)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize },
-        32usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(v)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).mh) as usize - ptr as usize },
-        40usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(mh)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).vh) as usize - ptr as usize },
-        48usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(vh)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).pf) as usize - ptr as usize },
-        56usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
@@ -2730,7 +3272,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).fx_best) as usize - ptr as usize },
-        64usize,
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
@@ -2740,7 +3282,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).fx_prev) as usize - ptr as usize },
-        68usize,
+        36usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
@@ -2750,7 +3292,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).n_no_improvement) as usize - ptr as usize },
-        72usize,
+        40usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
@@ -2991,7 +3533,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).iter) as usize - ptr as usize },
-        104usize,
+        128usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -3001,7 +3543,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).nx) as usize - ptr as usize },
-        112usize,
+        136usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -3011,7 +3553,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).just_initialized) as usize - ptr as usize },
-        120usize,
+        144usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -3019,9 +3561,29 @@ fn bindgen_test_layout_ggml_opt_context() {
             stringify!(just_initialized)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).loss_before) as usize - ptr as usize },
+        148usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_context),
+            "::",
+            stringify!(loss_before)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).loss_after) as usize - ptr as usize },
+        152usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_context),
+            "::",
+            stringify!(loss_after)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).adam) as usize - ptr as usize },
-        128usize,
+        160usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -3072,6 +3634,8 @@ extern "C" {
         f: *mut ggml_tensor,
         gf: *mut ggml_cgraph,
         gb: *mut ggml_cgraph,
+        callback: ggml_opt_callback,
+        callback_data: *mut ::std::os::raw::c_void,
     ) -> ggml_opt_result;
 }
 extern "C" {
@@ -3120,267 +3684,1781 @@ extern "C" {
     ) -> usize;
 }
 extern "C" {
-    pub fn ggml_quantize_chunk(
-        type_: ggml_type,
+    pub fn ggml_quantize_q2_K(
         src: *const f32,
         dst: *mut ::std::os::raw::c_void,
-        start: ::std::os::raw::c_int,
         n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
         hist: *mut i64,
     ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_avx() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_avx2() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_avx512() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_avx512_vbmi() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_avx512_vnni() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_fma() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_neon() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_arm_fma() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_f16c() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_fp16_va() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_wasm_simd() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_blas() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_q3_K(
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_cublas() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_q4_K(
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_clblast() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_q5_K(
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_gpublas() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_q6_K(
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_sse3() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_chunk(
+        type_: ggml_type,
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        start: ::std::os::raw::c_int,
+        n: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
-extern "C" {
-    pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int;
+pub const gguf_type_GGUF_TYPE_UINT8: gguf_type = 0;
+pub const gguf_type_GGUF_TYPE_INT8: gguf_type = 1;
+pub const gguf_type_GGUF_TYPE_UINT16: gguf_type = 2;
+pub const gguf_type_GGUF_TYPE_INT16: gguf_type = 3;
+pub const gguf_type_GGUF_TYPE_UINT32: gguf_type = 4;
+pub const gguf_type_GGUF_TYPE_INT32: gguf_type = 5;
+pub const gguf_type_GGUF_TYPE_FLOAT32: gguf_type = 6;
+pub const gguf_type_GGUF_TYPE_BOOL: gguf_type = 7;
+pub const gguf_type_GGUF_TYPE_STRING: gguf_type = 8;
+pub const gguf_type_GGUF_TYPE_ARRAY: gguf_type = 9;
+pub const gguf_type_GGUF_TYPE_UINT64: gguf_type = 10;
+pub const gguf_type_GGUF_TYPE_INT64: gguf_type = 11;
+pub const gguf_type_GGUF_TYPE_FLOAT64: gguf_type = 12;
+pub const gguf_type_GGUF_TYPE_COUNT: gguf_type = 13;
+pub type gguf_type = ::std::os::raw::c_uint;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct gguf_context {
+    _unused: [u8; 0],
 }
-pub type ggml_to_float_t = ::std::option::Option<
-    unsafe extern "C" fn(x: *const ::std::os::raw::c_void, y: *mut f32, k: ::std::os::raw::c_int),
->;
-pub type ggml_from_float_t = ::std::option::Option<
-    unsafe extern "C" fn(x: *const f32, y: *mut ::std::os::raw::c_void, k: ::std::os::raw::c_int),
->;
-pub type ggml_vec_dot_t = ::std::option::Option<
-    unsafe extern "C" fn(
-        n: ::std::os::raw::c_int,
-        s: *mut f32,
-        x: *const ::std::os::raw::c_void,
-        y: *const ::std::os::raw::c_void,
-    ),
->;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
-pub struct ggml_type_traits_t {
-    pub to_float: ggml_to_float_t,
-    pub from_float: ggml_from_float_t,
-    pub from_float_reference: ggml_from_float_t,
-    pub vec_dot: ggml_vec_dot_t,
-    pub vec_dot_type: ggml_type,
+pub struct gguf_init_params {
+    pub no_alloc: bool,
+    pub ctx: *mut *mut ggml_context,
 }
 #[test]
-fn bindgen_test_layout_ggml_type_traits_t() {
-    const UNINIT: ::std::mem::MaybeUninit<ggml_type_traits_t> = ::std::mem::MaybeUninit::uninit();
+fn bindgen_test_layout_gguf_init_params() {
+    const UNINIT: ::std::mem::MaybeUninit<gguf_init_params> = ::std::mem::MaybeUninit::uninit();
     let ptr = UNINIT.as_ptr();
     assert_eq!(
-        ::std::mem::size_of::<ggml_type_traits_t>(),
-        40usize,
-        concat!("Size of: ", stringify!(ggml_type_traits_t))
+        ::std::mem::size_of::<gguf_init_params>(),
+        16usize,
+        concat!("Size of: ", stringify!(gguf_init_params))
     );
     assert_eq!(
-        ::std::mem::align_of::<ggml_type_traits_t>(),
+        ::std::mem::align_of::<gguf_init_params>(),
         8usize,
-        concat!("Alignment of ", stringify!(ggml_type_traits_t))
+        concat!("Alignment of ", stringify!(gguf_init_params))
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).no_alloc) as usize - ptr as usize },
         0usize,
         concat!(
             "Offset of field: ",
-            stringify!(ggml_type_traits_t),
+            stringify!(gguf_init_params),
             "::",
-            stringify!(to_float)
+            stringify!(no_alloc)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).from_float) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).ctx) as usize - ptr as usize },
         8usize,
         concat!(
             "Offset of field: ",
-            stringify!(ggml_type_traits_t),
-            "::",
-            stringify!(from_float)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).from_float_reference) as usize - ptr as usize },
-        16usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_type_traits_t),
-            "::",
-            stringify!(from_float_reference)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).vec_dot) as usize - ptr as usize },
-        24usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_type_traits_t),
-            "::",
-            stringify!(vec_dot)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_type) as usize - ptr as usize },
-        32usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_type_traits_t),
+            stringify!(gguf_init_params),
             "::",
-            stringify!(vec_dot_type)
+            stringify!(ctx)
         )
     );
 }
 extern "C" {
-    pub fn ggml_internal_get_type_traits(i: ggml_type) -> ggml_type_traits_t;
+    pub fn gguf_init_empty() -> *mut gguf_context;
 }
 extern "C" {
-    pub fn ggml_init_cublas();
+    pub fn gguf_init_from_file(
+        fname: *const ::std::os::raw::c_char,
+        params: gguf_init_params,
+    ) -> *mut gguf_context;
 }
 extern "C" {
-    pub fn ggml_cuda_set_tensor_split(tensor_split: *const f32);
+    pub fn gguf_free(ctx: *mut gguf_context);
 }
 extern "C" {
-    pub fn ggml_cuda_mul(src0: *const ggml_tensor, src1: *const ggml_tensor, dst: *mut ggml_tensor);
+    pub fn gguf_type_name(type_: gguf_type) -> *const ::std::os::raw::c_char;
 }
 extern "C" {
-    pub fn ggml_cuda_can_mul_mat(
-        src0: *const ggml_tensor,
-        src1: *const ggml_tensor,
-        dst: *mut ggml_tensor,
-    ) -> bool;
+    pub fn gguf_get_version(ctx: *const gguf_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn ggml_cuda_mul_mat_get_wsize(
-        src0: *const ggml_tensor,
-        src1: *const ggml_tensor,
-        dst: *mut ggml_tensor,
-    ) -> usize;
+    pub fn gguf_get_alignment(ctx: *const gguf_context) -> usize;
 }
 extern "C" {
-    pub fn ggml_cuda_mul_mat(
-        src0: *const ggml_tensor,
-        src1: *const ggml_tensor,
-        dst: *mut ggml_tensor,
-        wdata: *mut ::std::os::raw::c_void,
-        wsize: usize,
-    );
+    pub fn gguf_get_data_offset(ctx: *const gguf_context) -> usize;
 }
 extern "C" {
-    pub fn ggml_cuda_host_malloc(size: usize) -> *mut ::std::os::raw::c_void;
+    pub fn gguf_get_data(ctx: *const gguf_context) -> *mut ::std::os::raw::c_void;
 }
 extern "C" {
-    pub fn ggml_cuda_host_free(ptr: *mut ::std::os::raw::c_void);
+    pub fn gguf_get_n_kv(ctx: *const gguf_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn ggml_cuda_transform_tensor(data: *mut ::std::os::raw::c_void, tensor: *mut ggml_tensor);
+    pub fn gguf_find_key(
+        ctx: *const gguf_context,
+        key: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn ggml_cuda_free_data(tensor: *mut ggml_tensor);
+    pub fn gguf_get_key(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_char;
 }
 extern "C" {
-    pub fn ggml_cuda_assign_buffers(tensor: *mut ggml_tensor);
+    pub fn gguf_get_kv_type(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> gguf_type;
 }
 extern "C" {
-    pub fn ggml_cuda_assign_buffers_no_scratch(tensor: *mut ggml_tensor);
+    pub fn gguf_get_arr_type(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> gguf_type;
 }
 extern "C" {
-    pub fn ggml_cuda_assign_buffers_force_inplace(tensor: *mut ggml_tensor);
+    pub fn gguf_get_val_u8(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> u8;
 }
 extern "C" {
-    pub fn ggml_cuda_set_main_device(main_device: ::std::os::raw::c_int);
+    pub fn gguf_get_val_i8(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> i8;
 }
 extern "C" {
-    pub fn ggml_cuda_set_mul_mat_q(mul_mat_q: bool);
+    pub fn gguf_get_val_u16(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> u16;
 }
 extern "C" {
-    pub fn ggml_cuda_set_scratch_size(scratch_size: usize);
+    pub fn gguf_get_val_i16(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> i16;
 }
 extern "C" {
-    pub fn ggml_cuda_free_scratch();
+    pub fn gguf_get_val_u32(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> u32;
 }
 extern "C" {
-    pub fn ggml_cuda_compute_forward(
-        params: *mut ggml_compute_params,
-        tensor: *mut ggml_tensor,
-    ) -> bool;
+    pub fn gguf_get_val_i32(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> i32;
 }
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct llama_model {
-    _unused: [u8; 0],
+extern "C" {
+    pub fn gguf_get_val_f32(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> f32;
 }
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct llama_context {
-    _unused: [u8; 0],
+extern "C" {
+    pub fn gguf_get_val_u64(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> u64;
 }
-pub type llama_token = ::std::os::raw::c_int;
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct llama_token_data {
-    pub id: llama_token,
-    pub logit: f32,
-    pub p: f32,
+extern "C" {
+    pub fn gguf_get_val_i64(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> i64;
 }
-#[test]
-fn bindgen_test_layout_llama_token_data() {
-    const UNINIT: ::std::mem::MaybeUninit<llama_token_data> = ::std::mem::MaybeUninit::uninit();
-    let ptr = UNINIT.as_ptr();
-    assert_eq!(
-        ::std::mem::size_of::<llama_token_data>(),
-        12usize,
-        concat!("Size of: ", stringify!(llama_token_data))
-    );
-    assert_eq!(
-        ::std::mem::align_of::<llama_token_data>(),
-        4usize,
-        concat!("Alignment of ", stringify!(llama_token_data))
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).id) as usize - ptr as usize },
-        0usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(llama_token_data),
-            "::",
+extern "C" {
+    pub fn gguf_get_val_f64(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> f64;
+}
+extern "C" {
+    pub fn gguf_get_val_bool(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> bool;
+}
+extern "C" {
+    pub fn gguf_get_val_str(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_get_val_data(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_void;
+}
+extern "C" {
+    pub fn gguf_get_arr_n(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_get_arr_data(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_void;
+}
+extern "C" {
+    pub fn gguf_get_arr_str(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+        i: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_get_n_tensors(ctx: *const gguf_context) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_find_tensor(
+        ctx: *const gguf_context,
+        name: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_get_tensor_offset(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> usize;
+}
+extern "C" {
+    pub fn gguf_get_tensor_name(
+        ctx: *const gguf_context,
+        i: ::std::os::raw::c_int,
+    ) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_set_val_u8(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u8);
+}
+extern "C" {
+    pub fn gguf_set_val_i8(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i8);
+}
+extern "C" {
+    pub fn gguf_set_val_u16(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u16);
+}
+extern "C" {
+    pub fn gguf_set_val_i16(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i16);
+}
+extern "C" {
+    pub fn gguf_set_val_u32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u32);
+}
+extern "C" {
+    pub fn gguf_set_val_i32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i32);
+}
+extern "C" {
+    pub fn gguf_set_val_f32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: f32);
+}
+extern "C" {
+    pub fn gguf_set_val_u64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u64);
+}
+extern "C" {
+    pub fn gguf_set_val_i64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i64);
+}
+extern "C" {
+    pub fn gguf_set_val_f64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: f64);
+}
+extern "C" {
+    pub fn gguf_set_val_bool(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: bool);
+}
+extern "C" {
+    pub fn gguf_set_val_str(
+        ctx: *mut gguf_context,
+        key: *const ::std::os::raw::c_char,
+        val: *const ::std::os::raw::c_char,
+    );
+}
+extern "C" {
+    pub fn gguf_set_arr_data(
+        ctx: *mut gguf_context,
+        key: *const ::std::os::raw::c_char,
+        type_: gguf_type,
+        data: *const ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+    );
+}
+extern "C" {
+    pub fn gguf_set_arr_str(
+        ctx: *mut gguf_context,
+        key: *const ::std::os::raw::c_char,
+        data: *mut *const ::std::os::raw::c_char,
+        n: ::std::os::raw::c_int,
+    );
+}
+extern "C" {
+    pub fn gguf_set_kv(ctx: *mut gguf_context, src: *mut gguf_context);
+}
+extern "C" {
+    pub fn gguf_add_tensor(ctx: *mut gguf_context, tensor: *const ggml_tensor);
+}
+extern "C" {
+    pub fn gguf_set_tensor_type(
+        ctx: *mut gguf_context,
+        name: *const ::std::os::raw::c_char,
+        type_: ggml_type,
+    );
+}
+extern "C" {
+    pub fn gguf_set_tensor_data(
+        ctx: *mut gguf_context,
+        name: *const ::std::os::raw::c_char,
+        data: *const ::std::os::raw::c_void,
+        size: usize,
+    );
+}
+extern "C" {
+    pub fn gguf_write_to_file(
+        ctx: *const gguf_context,
+        fname: *const ::std::os::raw::c_char,
+        only_meta: bool,
+    );
+}
+extern "C" {
+    pub fn gguf_get_meta_size(ctx: *const gguf_context) -> usize;
+}
+extern "C" {
+    pub fn gguf_get_meta_data(ctx: *const gguf_context, data: *mut ::std::os::raw::c_void);
+}
+extern "C" {
+    pub fn ggml_cpu_has_avx() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_avx2() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_avx512() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_avx512_vbmi() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_avx512_vnni() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_fma() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_neon() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_arm_fma() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_metal() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_f16c() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_fp16_va() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_wasm_simd() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_blas() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_cublas() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_clblast() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_gpublas() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_sse3() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_ssse3() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int;
+}
+pub type ggml_to_float_t = ::std::option::Option<
+    unsafe extern "C" fn(x: *const ::std::os::raw::c_void, y: *mut f32, k: ::std::os::raw::c_int),
+>;
+pub type ggml_from_float_t = ::std::option::Option<
+    unsafe extern "C" fn(x: *const f32, y: *mut ::std::os::raw::c_void, k: ::std::os::raw::c_int),
+>;
+pub type ggml_vec_dot_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        n: ::std::os::raw::c_int,
+        s: *mut f32,
+        x: *const ::std::os::raw::c_void,
+        y: *const ::std::os::raw::c_void,
+    ),
+>;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct ggml_type_traits_t {
+    pub type_name: *const ::std::os::raw::c_char,
+    pub blck_size: ::std::os::raw::c_int,
+    pub type_size: usize,
+    pub is_quantized: bool,
+    pub to_float: ggml_to_float_t,
+    pub from_float: ggml_from_float_t,
+    pub from_float_reference: ggml_from_float_t,
+    pub vec_dot: ggml_vec_dot_t,
+    pub vec_dot_type: ggml_type,
+}
+#[test]
+fn bindgen_test_layout_ggml_type_traits_t() {
+    const UNINIT: ::std::mem::MaybeUninit<ggml_type_traits_t> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<ggml_type_traits_t>(),
+        72usize,
+        concat!("Size of: ", stringify!(ggml_type_traits_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<ggml_type_traits_t>(),
+        8usize,
+        concat!("Alignment of ", stringify!(ggml_type_traits_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).type_name) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(type_name)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).blck_size) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(blck_size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).type_size) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(type_size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).is_quantized) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(is_quantized)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize },
+        32usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(to_float)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).from_float) as usize - ptr as usize },
+        40usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(from_float)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).from_float_reference) as usize - ptr as usize },
+        48usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(from_float_reference)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).vec_dot) as usize - ptr as usize },
+        56usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(vec_dot)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_type) as usize - ptr as usize },
+        64usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(vec_dot_type)
+        )
+    );
+}
+extern "C" {
+    pub fn ggml_internal_get_type_traits(type_: ggml_type) -> ggml_type_traits_t;
+}
+pub type va_list = __builtin_va_list;
+pub type __gnuc_va_list = __builtin_va_list;
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct __mbstate_t {
+    pub __count: ::std::os::raw::c_int,
+    pub __value: __mbstate_t__bindgen_ty_1,
+}
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union __mbstate_t__bindgen_ty_1 {
+    pub __wch: ::std::os::raw::c_uint,
+    pub __wchb: [::std::os::raw::c_char; 4usize],
+}
+#[test]
+fn bindgen_test_layout___mbstate_t__bindgen_ty_1() {
+    const UNINIT: ::std::mem::MaybeUninit<__mbstate_t__bindgen_ty_1> =
+        ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<__mbstate_t__bindgen_ty_1>(),
+        4usize,
+        concat!("Size of: ", stringify!(__mbstate_t__bindgen_ty_1))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<__mbstate_t__bindgen_ty_1>(),
+        4usize,
+        concat!("Alignment of ", stringify!(__mbstate_t__bindgen_ty_1))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__wch) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__mbstate_t__bindgen_ty_1),
+            "::",
+            stringify!(__wch)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__wchb) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__mbstate_t__bindgen_ty_1),
+            "::",
+            stringify!(__wchb)
+        )
+    );
+}
+#[test]
+fn bindgen_test_layout___mbstate_t() {
+    const UNINIT: ::std::mem::MaybeUninit<__mbstate_t> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<__mbstate_t>(),
+        8usize,
+        concat!("Size of: ", stringify!(__mbstate_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<__mbstate_t>(),
+        4usize,
+        concat!("Alignment of ", stringify!(__mbstate_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__count) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__mbstate_t),
+            "::",
+            stringify!(__count)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__value) as usize - ptr as usize },
+        4usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__mbstate_t),
+            "::",
+            stringify!(__value)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct _G_fpos_t {
+    pub __pos: __off_t,
+    pub __state: __mbstate_t,
+}
+#[test]
+fn bindgen_test_layout__G_fpos_t() {
+    const UNINIT: ::std::mem::MaybeUninit<_G_fpos_t> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<_G_fpos_t>(),
+        16usize,
+        concat!("Size of: ", stringify!(_G_fpos_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<_G_fpos_t>(),
+        8usize,
+        concat!("Alignment of ", stringify!(_G_fpos_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__pos) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_G_fpos_t),
+            "::",
+            stringify!(__pos)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__state) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_G_fpos_t),
+            "::",
+            stringify!(__state)
+        )
+    );
+}
+pub type __fpos_t = _G_fpos_t;
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct _G_fpos64_t {
+    pub __pos: __off64_t,
+    pub __state: __mbstate_t,
+}
+#[test]
+fn bindgen_test_layout__G_fpos64_t() {
+    const UNINIT: ::std::mem::MaybeUninit<_G_fpos64_t> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<_G_fpos64_t>(),
+        16usize,
+        concat!("Size of: ", stringify!(_G_fpos64_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<_G_fpos64_t>(),
+        8usize,
+        concat!("Alignment of ", stringify!(_G_fpos64_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__pos) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_G_fpos64_t),
+            "::",
+            stringify!(__pos)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__state) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_G_fpos64_t),
+            "::",
+            stringify!(__state)
+        )
+    );
+}
+pub type __fpos64_t = _G_fpos64_t;
+pub type __FILE = _IO_FILE;
+pub type FILE = _IO_FILE;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_marker {
+    _unused: [u8; 0],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_codecvt {
+    _unused: [u8; 0],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_wide_data {
+    _unused: [u8; 0],
+}
+pub type _IO_lock_t = ::std::os::raw::c_void;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_FILE {
+    pub _flags: ::std::os::raw::c_int,
+    pub _IO_read_ptr: *mut ::std::os::raw::c_char,
+    pub _IO_read_end: *mut ::std::os::raw::c_char,
+    pub _IO_read_base: *mut ::std::os::raw::c_char,
+    pub _IO_write_base: *mut ::std::os::raw::c_char,
+    pub _IO_write_ptr: *mut ::std::os::raw::c_char,
+    pub _IO_write_end: *mut ::std::os::raw::c_char,
+    pub _IO_buf_base: *mut ::std::os::raw::c_char,
+    pub _IO_buf_end: *mut ::std::os::raw::c_char,
+    pub _IO_save_base: *mut ::std::os::raw::c_char,
+    pub _IO_backup_base: *mut ::std::os::raw::c_char,
+    pub _IO_save_end: *mut ::std::os::raw::c_char,
+    pub _markers: *mut _IO_marker,
+    pub _chain: *mut _IO_FILE,
+    pub _fileno: ::std::os::raw::c_int,
+    pub _flags2: ::std::os::raw::c_int,
+    pub _old_offset: __off_t,
+    pub _cur_column: ::std::os::raw::c_ushort,
+    pub _vtable_offset: ::std::os::raw::c_schar,
+    pub _shortbuf: [::std::os::raw::c_char; 1usize],
+    pub _lock: *mut _IO_lock_t,
+    pub _offset: __off64_t,
+    pub _codecvt: *mut _IO_codecvt,
+    pub _wide_data: *mut _IO_wide_data,
+    pub _freeres_list: *mut _IO_FILE,
+    pub _freeres_buf: *mut ::std::os::raw::c_void,
+    pub __pad5: usize,
+    pub _mode: ::std::os::raw::c_int,
+    pub _unused2: [::std::os::raw::c_char; 20usize],
+}
+#[test]
+fn bindgen_test_layout__IO_FILE() {
+    const UNINIT: ::std::mem::MaybeUninit<_IO_FILE> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<_IO_FILE>(),
+        216usize,
+        concat!("Size of: ", stringify!(_IO_FILE))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<_IO_FILE>(),
+        8usize,
+        concat!("Alignment of ", stringify!(_IO_FILE))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._flags) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_flags)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_read_ptr) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_read_ptr)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_read_end) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_read_end)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_read_base) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_read_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_write_base) as usize - ptr as usize },
+        32usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_write_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_write_ptr) as usize - ptr as usize },
+        40usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_write_ptr)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_write_end) as usize - ptr as usize },
+        48usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_write_end)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_buf_base) as usize - ptr as usize },
+        56usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_buf_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_buf_end) as usize - ptr as usize },
+        64usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_buf_end)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_save_base) as usize - ptr as usize },
+        72usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_save_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_backup_base) as usize - ptr as usize },
+        80usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_backup_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_save_end) as usize - ptr as usize },
+        88usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_save_end)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._markers) as usize - ptr as usize },
+        96usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_markers)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._chain) as usize - ptr as usize },
+        104usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_chain)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._fileno) as usize - ptr as usize },
+        112usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_fileno)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._flags2) as usize - ptr as usize },
+        116usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_flags2)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._old_offset) as usize - ptr as usize },
+        120usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_old_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._cur_column) as usize - ptr as usize },
+        128usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_cur_column)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._vtable_offset) as usize - ptr as usize },
+        130usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_vtable_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._shortbuf) as usize - ptr as usize },
+        131usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_shortbuf)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._lock) as usize - ptr as usize },
+        136usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_lock)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._offset) as usize - ptr as usize },
+        144usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._codecvt) as usize - ptr as usize },
+        152usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_codecvt)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._wide_data) as usize - ptr as usize },
+        160usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_wide_data)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._freeres_list) as usize - ptr as usize },
+        168usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_freeres_list)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._freeres_buf) as usize - ptr as usize },
+        176usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_freeres_buf)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__pad5) as usize - ptr as usize },
+        184usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(__pad5)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._mode) as usize - ptr as usize },
+        192usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_mode)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._unused2) as usize - ptr as usize },
+        196usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_unused2)
+        )
+    );
+}
+pub type cookie_read_function_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        __cookie: *mut ::std::os::raw::c_void,
+        __buf: *mut ::std::os::raw::c_char,
+        __nbytes: usize,
+    ) -> __ssize_t,
+>;
+pub type cookie_write_function_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        __cookie: *mut ::std::os::raw::c_void,
+        __buf: *const ::std::os::raw::c_char,
+        __nbytes: usize,
+    ) -> __ssize_t,
+>;
+pub type cookie_seek_function_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        __cookie: *mut ::std::os::raw::c_void,
+        __pos: *mut __off64_t,
+        __w: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int,
+>;
+pub type cookie_close_function_t = ::std::option::Option<
+    unsafe extern "C" fn(__cookie: *mut ::std::os::raw::c_void) -> ::std::os::raw::c_int,
+>;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_cookie_io_functions_t {
+    pub read: cookie_read_function_t,
+    pub write: cookie_write_function_t,
+    pub seek: cookie_seek_function_t,
+    pub close: cookie_close_function_t,
+}
+#[test]
+fn bindgen_test_layout__IO_cookie_io_functions_t() {
+    const UNINIT: ::std::mem::MaybeUninit<_IO_cookie_io_functions_t> =
+        ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<_IO_cookie_io_functions_t>(),
+        32usize,
+        concat!("Size of: ", stringify!(_IO_cookie_io_functions_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<_IO_cookie_io_functions_t>(),
+        8usize,
+        concat!("Alignment of ", stringify!(_IO_cookie_io_functions_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).read) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_cookie_io_functions_t),
+            "::",
+            stringify!(read)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).write) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_cookie_io_functions_t),
+            "::",
+            stringify!(write)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).seek) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_cookie_io_functions_t),
+            "::",
+            stringify!(seek)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).close) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_cookie_io_functions_t),
+            "::",
+            stringify!(close)
+        )
+    );
+}
+pub type cookie_io_functions_t = _IO_cookie_io_functions_t;
+pub type off_t = __off_t;
+pub type off64_t = __off64_t;
+pub type fpos_t = __fpos_t;
+pub type fpos64_t = __fpos64_t;
+extern "C" {
+    pub static mut stdin: *mut FILE;
+}
+extern "C" {
+    pub static mut stdout: *mut FILE;
+}
+extern "C" {
+    pub static mut stderr: *mut FILE;
+}
+extern "C" {
+    pub fn remove(__filename: *const ::std::os::raw::c_char) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn rename(
+        __old: *const ::std::os::raw::c_char,
+        __new: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn renameat(
+        __oldfd: ::std::os::raw::c_int,
+        __old: *const ::std::os::raw::c_char,
+        __newfd: ::std::os::raw::c_int,
+        __new: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn renameat2(
+        __oldfd: ::std::os::raw::c_int,
+        __old: *const ::std::os::raw::c_char,
+        __newfd: ::std::os::raw::c_int,
+        __new: *const ::std::os::raw::c_char,
+        __flags: ::std::os::raw::c_uint,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fclose(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn tmpfile() -> *mut FILE;
+}
+extern "C" {
+    pub fn tmpfile64() -> *mut FILE;
+}
+extern "C" {
+    pub fn tmpnam(arg1: *mut ::std::os::raw::c_char) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn tmpnam_r(__s: *mut ::std::os::raw::c_char) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn tempnam(
+        __dir: *const ::std::os::raw::c_char,
+        __pfx: *const ::std::os::raw::c_char,
+    ) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn fflush(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fflush_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fcloseall() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fopen(
+        __filename: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+    ) -> *mut FILE;
+}
+extern "C" {
+    pub fn freopen(
+        __filename: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+        __stream: *mut FILE,
+    ) -> *mut FILE;
+}
+extern "C" {
+    pub fn fopen64(
+        __filename: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+    ) -> *mut FILE;
+}
+extern "C" {
+    pub fn freopen64(
+        __filename: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+        __stream: *mut FILE,
+    ) -> *mut FILE;
+}
+extern "C" {
+    pub fn fdopen(__fd: ::std::os::raw::c_int, __modes: *const ::std::os::raw::c_char)
+        -> *mut FILE;
+}
+extern "C" {
+    pub fn fopencookie(
+        __magic_cookie: *mut ::std::os::raw::c_void,
+        __modes: *const ::std::os::raw::c_char,
+        __io_funcs: cookie_io_functions_t,
+    ) -> *mut FILE;
+}
+extern "C" {
+    pub fn fmemopen(
+        __s: *mut ::std::os::raw::c_void,
+        __len: usize,
+        __modes: *const ::std::os::raw::c_char,
+    ) -> *mut FILE;
+}
+extern "C" {
+    pub fn open_memstream(
+        __bufloc: *mut *mut ::std::os::raw::c_char,
+        __sizeloc: *mut usize,
+    ) -> *mut FILE;
+}
+extern "C" {
+    pub fn setbuf(__stream: *mut FILE, __buf: *mut ::std::os::raw::c_char);
+}
+extern "C" {
+    pub fn setvbuf(
+        __stream: *mut FILE,
+        __buf: *mut ::std::os::raw::c_char,
+        __modes: ::std::os::raw::c_int,
+        __n: usize,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn setbuffer(__stream: *mut FILE, __buf: *mut ::std::os::raw::c_char, __size: usize);
+}
+extern "C" {
+    pub fn setlinebuf(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn fprintf(
+        __stream: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn printf(__format: *const ::std::os::raw::c_char, ...) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn sprintf(
+        __s: *mut ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn vfprintf(
+        __s: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn vprintf(
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn vsprintf(
+        __s: *mut ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn snprintf(
+        __s: *mut ::std::os::raw::c_char,
+        __maxlen: usize,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn vsnprintf(
+        __s: *mut ::std::os::raw::c_char,
+        __maxlen: usize,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn vasprintf(
+        __ptr: *mut *mut ::std::os::raw::c_char,
+        __f: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn __asprintf(
+        __ptr: *mut *mut ::std::os::raw::c_char,
+        __fmt: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn asprintf(
+        __ptr: *mut *mut ::std::os::raw::c_char,
+        __fmt: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn vdprintf(
+        __fd: ::std::os::raw::c_int,
+        __fmt: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn dprintf(
+        __fd: ::std::os::raw::c_int,
+        __fmt: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fscanf(
+        __stream: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn scanf(__format: *const ::std::os::raw::c_char, ...) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn sscanf(
+        __s: *const ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+pub type _Float32 = f32;
+pub type _Float64 = f64;
+pub type _Float32x = f64;
+pub type _Float64x = u128;
+extern "C" {
+    #[link_name = "\u{1}__isoc99_fscanf"]
+    pub fn fscanf1(
+        __stream: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    #[link_name = "\u{1}__isoc99_scanf"]
+    pub fn scanf1(__format: *const ::std::os::raw::c_char, ...) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    #[link_name = "\u{1}__isoc99_sscanf"]
+    pub fn sscanf1(
+        __s: *const ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn vfscanf(
+        __s: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn vscanf(
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn vsscanf(
+        __s: *const ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    #[link_name = "\u{1}__isoc99_vfscanf"]
+    pub fn vfscanf1(
+        __s: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    #[link_name = "\u{1}__isoc99_vscanf"]
+    pub fn vscanf1(
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    #[link_name = "\u{1}__isoc99_vsscanf"]
+    pub fn vsscanf1(
+        __s: *const ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fgetc(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn getc(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn getchar() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn getc_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn getchar_unlocked() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fgetc_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fputc(__c: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn putc(__c: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn putchar(__c: ::std::os::raw::c_int) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fputc_unlocked(__c: ::std::os::raw::c_int, __stream: *mut FILE)
+        -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn putc_unlocked(__c: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn putchar_unlocked(__c: ::std::os::raw::c_int) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn getw(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn putw(__w: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fgets(
+        __s: *mut ::std::os::raw::c_char,
+        __n: ::std::os::raw::c_int,
+        __stream: *mut FILE,
+    ) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn fgets_unlocked(
+        __s: *mut ::std::os::raw::c_char,
+        __n: ::std::os::raw::c_int,
+        __stream: *mut FILE,
+    ) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn __getdelim(
+        __lineptr: *mut *mut ::std::os::raw::c_char,
+        __n: *mut usize,
+        __delimiter: ::std::os::raw::c_int,
+        __stream: *mut FILE,
+    ) -> __ssize_t;
+}
+extern "C" {
+    pub fn getdelim(
+        __lineptr: *mut *mut ::std::os::raw::c_char,
+        __n: *mut usize,
+        __delimiter: ::std::os::raw::c_int,
+        __stream: *mut FILE,
+    ) -> __ssize_t;
+}
+extern "C" {
+    pub fn getline(
+        __lineptr: *mut *mut ::std::os::raw::c_char,
+        __n: *mut usize,
+        __stream: *mut FILE,
+    ) -> __ssize_t;
+}
+extern "C" {
+    pub fn fputs(__s: *const ::std::os::raw::c_char, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn puts(__s: *const ::std::os::raw::c_char) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ungetc(__c: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fread(
+        __ptr: *mut ::std::os::raw::c_void,
+        __size: usize,
+        __n: usize,
+        __stream: *mut FILE,
+    ) -> usize;
+}
+extern "C" {
+    pub fn fwrite(
+        __ptr: *const ::std::os::raw::c_void,
+        __size: usize,
+        __n: usize,
+        __s: *mut FILE,
+    ) -> usize;
+}
+extern "C" {
+    pub fn fputs_unlocked(
+        __s: *const ::std::os::raw::c_char,
+        __stream: *mut FILE,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fread_unlocked(
+        __ptr: *mut ::std::os::raw::c_void,
+        __size: usize,
+        __n: usize,
+        __stream: *mut FILE,
+    ) -> usize;
+}
+extern "C" {
+    pub fn fwrite_unlocked(
+        __ptr: *const ::std::os::raw::c_void,
+        __size: usize,
+        __n: usize,
+        __stream: *mut FILE,
+    ) -> usize;
+}
+extern "C" {
+    pub fn fseek(
+        __stream: *mut FILE,
+        __off: ::std::os::raw::c_long,
+        __whence: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ftell(__stream: *mut FILE) -> ::std::os::raw::c_long;
+}
+extern "C" {
+    pub fn rewind(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn fseeko(
+        __stream: *mut FILE,
+        __off: __off_t,
+        __whence: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ftello(__stream: *mut FILE) -> __off_t;
+}
+extern "C" {
+    pub fn fgetpos(__stream: *mut FILE, __pos: *mut fpos_t) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fsetpos(__stream: *mut FILE, __pos: *const fpos_t) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fseeko64(
+        __stream: *mut FILE,
+        __off: __off64_t,
+        __whence: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ftello64(__stream: *mut FILE) -> __off64_t;
+}
+extern "C" {
+    pub fn fgetpos64(__stream: *mut FILE, __pos: *mut fpos64_t) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fsetpos64(__stream: *mut FILE, __pos: *const fpos64_t) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn clearerr(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn feof(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ferror(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn clearerr_unlocked(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn feof_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ferror_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn perror(__s: *const ::std::os::raw::c_char);
+}
+extern "C" {
+    pub fn fileno(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fileno_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn pclose(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn popen(
+        __command: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+    ) -> *mut FILE;
+}
+extern "C" {
+    pub fn ctermid(__s: *mut ::std::os::raw::c_char) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn cuserid(__s: *mut ::std::os::raw::c_char) -> *mut ::std::os::raw::c_char;
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct obstack {
+    _unused: [u8; 0],
+}
+extern "C" {
+    pub fn obstack_printf(
+        __obstack: *mut obstack,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn obstack_vprintf(
+        __obstack: *mut obstack,
+        __format: *const ::std::os::raw::c_char,
+        __args: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn flockfile(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn ftrylockfile(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn funlockfile(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn __uflow(arg1: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn __overflow(arg1: *mut FILE, arg2: ::std::os::raw::c_int) -> ::std::os::raw::c_int;
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_model {
+    _unused: [u8; 0],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_context {
+    _unused: [u8; 0],
+}
+pub type llama_pos = i32;
+pub type llama_token = i32;
+pub type llama_seq_id = i32;
+pub const llama_vocab_type_LLAMA_VOCAB_TYPE_SPM: llama_vocab_type = 0;
+pub const llama_vocab_type_LLAMA_VOCAB_TYPE_BPE: llama_vocab_type = 1;
+pub type llama_vocab_type = ::std::os::raw::c_uint;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_UNDEFINED: llama_token_type = 0;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_NORMAL: llama_token_type = 1;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_UNKNOWN: llama_token_type = 2;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_CONTROL: llama_token_type = 3;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_USER_DEFINED: llama_token_type = 4;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_UNUSED: llama_token_type = 5;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_BYTE: llama_token_type = 6;
+pub type llama_token_type = ::std::os::raw::c_uint;
+pub const llama_ftype_LLAMA_FTYPE_ALL_F32: llama_ftype = 0;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_1: llama_ftype = 3;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: llama_ftype = 4;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q8_0: llama_ftype = 7;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_0: llama_ftype = 8;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_1: llama_ftype = 9;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q2_K: llama_ftype = 10;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_S: llama_ftype = 11;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_M: llama_ftype = 12;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_L: llama_ftype = 13;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_K_S: llama_ftype = 14;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_K_M: llama_ftype = 15;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_K_S: llama_ftype = 16;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_K_M: llama_ftype = 17;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q6_K: llama_ftype = 18;
+pub const llama_ftype_LLAMA_FTYPE_GUESSED: llama_ftype = 1024;
+pub type llama_ftype = ::std::os::raw::c_uint;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_UNSPECIFIED: llama_rope_scaling_type = -1;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_NONE: llama_rope_scaling_type = 0;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_LINEAR: llama_rope_scaling_type = 1;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_YARN: llama_rope_scaling_type = 2;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_MAX_VALUE: llama_rope_scaling_type = 2;
+pub type llama_rope_scaling_type = ::std::os::raw::c_int;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_token_data {
+    pub id: llama_token,
+    pub logit: f32,
+    pub p: f32,
+}
+#[test]
+fn bindgen_test_layout_llama_token_data() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_token_data> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_token_data>(),
+        12usize,
+        concat!("Size of: ", stringify!(llama_token_data))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_token_data>(),
+        4usize,
+        concat!("Alignment of ", stringify!(llama_token_data))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).id) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_token_data),
+            "::",
             stringify!(id)
         )
     );
@@ -3462,26 +5540,259 @@ pub type llama_progress_callback =
     ::std::option::Option<unsafe extern "C" fn(progress: f32, ctx: *mut ::std::os::raw::c_void)>;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
-pub struct llama_context_params {
-    pub seed: u32,
-    pub n_ctx: i32,
-    pub n_batch: i32,
-    pub n_gqa: i32,
-    pub rms_norm_eps: f32,
+pub struct llama_batch {
+    pub n_tokens: i32,
+    pub token: *mut llama_token,
+    pub embd: *mut f32,
+    pub pos: *mut llama_pos,
+    pub n_seq_id: *mut i32,
+    pub seq_id: *mut *mut llama_seq_id,
+    pub logits: *mut i8,
+    pub all_pos_0: llama_pos,
+    pub all_pos_1: llama_pos,
+    pub all_seq_id: llama_seq_id,
+}
+#[test]
+fn bindgen_test_layout_llama_batch() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_batch> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_batch>(),
+        72usize,
+        concat!("Size of: ", stringify!(llama_batch))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_batch>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_batch))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_tokens) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(n_tokens)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).token) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(token)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).embd) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(embd)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).pos) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(pos)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_seq_id) as usize - ptr as usize },
+        32usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(n_seq_id)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).seq_id) as usize - ptr as usize },
+        40usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(seq_id)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).logits) as usize - ptr as usize },
+        48usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(logits)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).all_pos_0) as usize - ptr as usize },
+        56usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(all_pos_0)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).all_pos_1) as usize - ptr as usize },
+        60usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(all_pos_1)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).all_seq_id) as usize - ptr as usize },
+        64usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(all_seq_id)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_model_params {
     pub n_gpu_layers: i32,
     pub main_gpu: i32,
     pub tensor_split: *const f32,
-    pub rope_freq_base: f32,
-    pub rope_freq_scale: f32,
     pub progress_callback: llama_progress_callback,
     pub progress_callback_user_data: *mut ::std::os::raw::c_void,
-    pub low_vram: bool,
-    pub mul_mat_q: bool,
-    pub f16_kv: bool,
-    pub logits_all: bool,
     pub vocab_only: bool,
     pub use_mmap: bool,
     pub use_mlock: bool,
+}
+#[test]
+fn bindgen_test_layout_llama_model_params() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_model_params> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_model_params>(),
+        40usize,
+        concat!("Size of: ", stringify!(llama_model_params))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_model_params>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_model_params))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_gpu_layers) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(n_gpu_layers)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).main_gpu) as usize - ptr as usize },
+        4usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(main_gpu)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).tensor_split) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(tensor_split)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).progress_callback) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(progress_callback)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).progress_callback_user_data) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(progress_callback_user_data)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).vocab_only) as usize - ptr as usize },
+        32usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(vocab_only)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).use_mmap) as usize - ptr as usize },
+        33usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(use_mmap)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).use_mlock) as usize - ptr as usize },
+        34usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(use_mlock)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_context_params {
+    pub seed: u32,
+    pub n_ctx: u32,
+    pub n_batch: u32,
+    pub n_threads: u32,
+    pub n_threads_batch: u32,
+    pub rope_scaling_type: i8,
+    pub rope_freq_base: f32,
+    pub rope_freq_scale: f32,
+    pub yarn_ext_factor: f32,
+    pub yarn_attn_factor: f32,
+    pub yarn_beta_fast: f32,
+    pub yarn_beta_slow: f32,
+    pub yarn_orig_ctx: u32,
+    pub mul_mat_q: bool,
+    pub f16_kv: bool,
+    pub logits_all: bool,
     pub embedding: bool,
 }
 #[test]
@@ -3490,12 +5801,12 @@ fn bindgen_test_layout_llama_context_params() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<llama_context_params>(),
-        72usize,
+        56usize,
         concat!("Size of: ", stringify!(llama_context_params))
     );
     assert_eq!(
         ::std::mem::align_of::<llama_context_params>(),
-        8usize,
+        4usize,
         concat!("Alignment of ", stringify!(llama_context_params))
     );
     assert_eq!(
@@ -3529,58 +5840,38 @@ fn bindgen_test_layout_llama_context_params() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_gqa) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as usize - ptr as usize },
         12usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(n_gqa)
+            stringify!(n_threads)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).rms_norm_eps) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).n_threads_batch) as usize - ptr as usize },
         16usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(rms_norm_eps)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_gpu_layers) as usize - ptr as usize },
-        20usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(llama_context_params),
-            "::",
-            stringify!(n_gpu_layers)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).main_gpu) as usize - ptr as usize },
-        24usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(llama_context_params),
-            "::",
-            stringify!(main_gpu)
+            stringify!(n_threads_batch)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).tensor_split) as usize - ptr as usize },
-        32usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).rope_scaling_type) as usize - ptr as usize },
+        20usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(tensor_split)
+            stringify!(rope_scaling_type)
         )
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).rope_freq_base) as usize - ptr as usize },
-        40usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
@@ -3590,7 +5881,7 @@ fn bindgen_test_layout_llama_context_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).rope_freq_scale) as usize - ptr as usize },
-        44usize,
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
@@ -3599,98 +5890,88 @@ fn bindgen_test_layout_llama_context_params() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).progress_callback) as usize - ptr as usize },
-        48usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(llama_context_params),
-            "::",
-            stringify!(progress_callback)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).progress_callback_user_data) as usize - ptr as usize },
-        56usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_ext_factor) as usize - ptr as usize },
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(progress_callback_user_data)
+            stringify!(yarn_ext_factor)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).low_vram) as usize - ptr as usize },
-        64usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_attn_factor) as usize - ptr as usize },
+        36usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(low_vram)
+            stringify!(yarn_attn_factor)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).mul_mat_q) as usize - ptr as usize },
-        65usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_beta_fast) as usize - ptr as usize },
+        40usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(mul_mat_q)
+            stringify!(yarn_beta_fast)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).f16_kv) as usize - ptr as usize },
-        66usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_beta_slow) as usize - ptr as usize },
+        44usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(f16_kv)
+            stringify!(yarn_beta_slow)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).logits_all) as usize - ptr as usize },
-        67usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_orig_ctx) as usize - ptr as usize },
+        48usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(logits_all)
+            stringify!(yarn_orig_ctx)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).vocab_only) as usize - ptr as usize },
-        68usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).mul_mat_q) as usize - ptr as usize },
+        52usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(vocab_only)
+            stringify!(mul_mat_q)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).use_mmap) as usize - ptr as usize },
-        69usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).f16_kv) as usize - ptr as usize },
+        53usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(use_mmap)
+            stringify!(f16_kv)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).use_mlock) as usize - ptr as usize },
-        70usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).logits_all) as usize - ptr as usize },
+        54usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(use_mlock)
+            stringify!(logits_all)
         )
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).embedding) as usize - ptr as usize },
-        71usize,
+        55usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
@@ -3699,24 +5980,6 @@ fn bindgen_test_layout_llama_context_params() {
         )
     );
 }
-pub const llama_ftype_LLAMA_FTYPE_ALL_F32: llama_ftype = 0;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_1: llama_ftype = 3;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: llama_ftype = 4;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q8_0: llama_ftype = 7;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_0: llama_ftype = 8;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_1: llama_ftype = 9;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q2_K: llama_ftype = 10;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_S: llama_ftype = 11;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_M: llama_ftype = 12;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_L: llama_ftype = 13;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_K_S: llama_ftype = 14;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_K_M: llama_ftype = 15;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_K_S: llama_ftype = 16;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_K_M: llama_ftype = 17;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q6_K: llama_ftype = 18;
-pub type llama_ftype = ::std::os::raw::c_uint;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct llama_model_quantize_params {
@@ -3724,6 +5987,8 @@ pub struct llama_model_quantize_params {
     pub ftype: llama_ftype,
     pub allow_requantize: bool,
     pub quantize_output_tensor: bool,
+    pub only_copy: bool,
+    pub pure_: bool,
 }
 #[test]
 fn bindgen_test_layout_llama_model_quantize_params() {
@@ -3780,6 +6045,26 @@ fn bindgen_test_layout_llama_model_quantize_params() {
             stringify!(quantize_output_tensor)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).only_copy) as usize - ptr as usize },
+        10usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_quantize_params),
+            "::",
+            stringify!(only_copy)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).pure_) as usize - ptr as usize },
+        11usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_quantize_params),
+            "::",
+            stringify!(pure_)
+        )
+    );
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
@@ -3955,7 +6240,7 @@ fn bindgen_test_layout_llama_timings() {
     );
 }
 extern "C" {
-    pub fn llama_max_devices() -> ::std::os::raw::c_int;
+    pub fn llama_model_default_params() -> llama_model_params;
 }
 extern "C" {
     pub fn llama_context_default_params() -> llama_context_params;
@@ -3963,6 +6248,36 @@ extern "C" {
 extern "C" {
     pub fn llama_model_quantize_default_params() -> llama_model_quantize_params;
 }
+extern "C" {
+    pub fn llama_backend_init(numa: bool);
+}
+extern "C" {
+    pub fn llama_backend_free();
+}
+extern "C" {
+    pub fn llama_load_model_from_file(
+        path_model: *const ::std::os::raw::c_char,
+        params: llama_model_params,
+    ) -> *mut llama_model;
+}
+extern "C" {
+    pub fn llama_free_model(model: *mut llama_model);
+}
+extern "C" {
+    pub fn llama_new_context_with_model(
+        model: *mut llama_model,
+        params: llama_context_params,
+    ) -> *mut llama_context;
+}
+extern "C" {
+    pub fn llama_free(ctx: *mut llama_context);
+}
+extern "C" {
+    pub fn llama_time_us() -> i64;
+}
+extern "C" {
+    pub fn llama_max_devices() -> ::std::os::raw::c_int;
+}
 extern "C" {
     pub fn llama_mmap_supported() -> bool;
 }
@@ -3970,66 +6285,284 @@ extern "C" {
     pub fn llama_mlock_supported() -> bool;
 }
 extern "C" {
-    pub fn llama_backend_init(numa: bool);
+    pub fn llama_get_model(ctx: *const llama_context) -> *const llama_model;
 }
 extern "C" {
-    pub fn llama_backend_free();
+    pub fn llama_n_ctx(ctx: *const llama_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_time_us() -> i64;
+    pub fn llama_vocab_type(model: *const llama_model) -> llama_vocab_type;
+}
+extern "C" {
+    pub fn llama_n_vocab(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_n_ctx_train(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_n_embd(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_rope_freq_scale_train(model: *const llama_model) -> f32;
+}
+extern "C" {
+    pub fn llama_model_meta_val_str(
+        model: *const llama_model,
+        key: *const ::std::os::raw::c_char,
+        buf: *mut ::std::os::raw::c_char,
+        buf_size: usize,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_meta_count(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_meta_key_by_index(
+        model: *const llama_model,
+        i: ::std::os::raw::c_int,
+        buf: *mut ::std::os::raw::c_char,
+        buf_size: usize,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_meta_val_str_by_index(
+        model: *const llama_model,
+        i: ::std::os::raw::c_int,
+        buf: *mut ::std::os::raw::c_char,
+        buf_size: usize,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_desc(
+        model: *const llama_model,
+        buf: *mut ::std::os::raw::c_char,
+        buf_size: usize,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_size(model: *const llama_model) -> u64;
+}
+extern "C" {
+    pub fn llama_model_n_params(model: *const llama_model) -> u64;
+}
+extern "C" {
+    pub fn llama_get_model_tensor(
+        model: *mut llama_model,
+        name: *const ::std::os::raw::c_char,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn llama_model_quantize(
+        fname_inp: *const ::std::os::raw::c_char,
+        fname_out: *const ::std::os::raw::c_char,
+        params: *const llama_model_quantize_params,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_apply_lora_from_file(
+        ctx: *mut llama_context,
+        path_lora: *const ::std::os::raw::c_char,
+        scale: f32,
+        path_base_model: *const ::std::os::raw::c_char,
+        n_threads: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_apply_lora_from_file(
+        model: *const llama_model,
+        path_lora: *const ::std::os::raw::c_char,
+        scale: f32,
+        path_base_model: *const ::std::os::raw::c_char,
+        n_threads: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_kv_cache_view_cell {
+    pub pos: llama_pos,
+}
+#[test]
+fn bindgen_test_layout_llama_kv_cache_view_cell() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_kv_cache_view_cell> =
+        ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_kv_cache_view_cell>(),
+        4usize,
+        concat!("Size of: ", stringify!(llama_kv_cache_view_cell))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_kv_cache_view_cell>(),
+        4usize,
+        concat!("Alignment of ", stringify!(llama_kv_cache_view_cell))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).pos) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view_cell),
+            "::",
+            stringify!(pos)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_kv_cache_view {
+    pub n_cells: i32,
+    pub n_max_seq: i32,
+    pub token_count: i32,
+    pub used_cells: i32,
+    pub max_contiguous: i32,
+    pub max_contiguous_idx: i32,
+    pub cells: *mut llama_kv_cache_view_cell,
+    pub cells_sequences: *mut llama_seq_id,
+}
+#[test]
+fn bindgen_test_layout_llama_kv_cache_view() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_kv_cache_view> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_kv_cache_view>(),
+        40usize,
+        concat!("Size of: ", stringify!(llama_kv_cache_view))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_kv_cache_view>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_kv_cache_view))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_cells) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(n_cells)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_max_seq) as usize - ptr as usize },
+        4usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(n_max_seq)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).token_count) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(token_count)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).used_cells) as usize - ptr as usize },
+        12usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(used_cells)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).max_contiguous) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(max_contiguous)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).max_contiguous_idx) as usize - ptr as usize },
+        20usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(max_contiguous_idx)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).cells) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(cells)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).cells_sequences) as usize - ptr as usize },
+        32usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(cells_sequences)
+        )
+    );
 }
 extern "C" {
-    pub fn llama_load_model_from_file(
-        path_model: *const ::std::os::raw::c_char,
-        params: llama_context_params,
-    ) -> *mut llama_model;
+    pub fn llama_kv_cache_view_init(
+        ctx: *const llama_context,
+        n_max_seq: i32,
+    ) -> llama_kv_cache_view;
 }
 extern "C" {
-    pub fn llama_free_model(model: *mut llama_model);
+    pub fn llama_kv_cache_view_free(view: *mut llama_kv_cache_view);
 }
 extern "C" {
-    pub fn llama_new_context_with_model(
-        model: *mut llama_model,
-        params: llama_context_params,
-    ) -> *mut llama_context;
+    pub fn llama_kv_cache_view_update(ctx: *const llama_context, view: *mut llama_kv_cache_view);
 }
 extern "C" {
-    pub fn llama_init_from_file(
-        path_model: *const ::std::os::raw::c_char,
-        params: llama_context_params,
-    ) -> *mut llama_context;
+    pub fn llama_get_kv_cache_token_count(ctx: *const llama_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_free(ctx: *mut llama_context);
+    pub fn llama_get_kv_cache_used_cells(ctx: *const llama_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_model_quantize(
-        fname_inp: *const ::std::os::raw::c_char,
-        fname_out: *const ::std::os::raw::c_char,
-        params: *const llama_model_quantize_params,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_kv_cache_clear(ctx: *mut llama_context);
 }
 extern "C" {
-    pub fn llama_apply_lora_from_file(
+    pub fn llama_kv_cache_seq_rm(
         ctx: *mut llama_context,
-        path_lora: *const ::std::os::raw::c_char,
-        path_base_model: *const ::std::os::raw::c_char,
-        n_threads: ::std::os::raw::c_int,
-    ) -> ::std::os::raw::c_int;
+        seq_id: llama_seq_id,
+        p0: llama_pos,
+        p1: llama_pos,
+    );
 }
 extern "C" {
-    pub fn llama_model_apply_lora_from_file(
-        model: *const llama_model,
-        path_lora: *const ::std::os::raw::c_char,
-        path_base_model: *const ::std::os::raw::c_char,
-        n_threads: ::std::os::raw::c_int,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_kv_cache_seq_cp(
+        ctx: *mut llama_context,
+        seq_id_src: llama_seq_id,
+        seq_id_dst: llama_seq_id,
+        p0: llama_pos,
+        p1: llama_pos,
+    );
 }
 extern "C" {
-    pub fn llama_get_kv_cache_token_count(ctx: *const llama_context) -> ::std::os::raw::c_int;
+    pub fn llama_kv_cache_seq_keep(ctx: *mut llama_context, seq_id: llama_seq_id);
 }
 extern "C" {
-    pub fn llama_set_rng_seed(ctx: *mut llama_context, seed: u32);
+    pub fn llama_kv_cache_seq_shift(
+        ctx: *mut llama_context,
+        seq_id: llama_seq_id,
+        p0: llama_pos,
+        p1: llama_pos,
+        delta: llama_pos,
+    );
 }
 extern "C" {
     pub fn llama_get_state_size(ctx: *const llama_context) -> usize;
@@ -4060,105 +6593,106 @@ extern "C" {
 extern "C" {
     pub fn llama_eval(
         ctx: *mut llama_context,
-        tokens: *const llama_token,
-        n_tokens: ::std::os::raw::c_int,
+        tokens: *mut llama_token,
+        n_tokens: i32,
         n_past: ::std::os::raw::c_int,
-        n_threads: ::std::os::raw::c_int,
     ) -> ::std::os::raw::c_int;
 }
 extern "C" {
     pub fn llama_eval_embd(
         ctx: *mut llama_context,
-        embd: *const f32,
-        n_tokens: ::std::os::raw::c_int,
+        embd: *mut f32,
+        n_tokens: i32,
         n_past: ::std::os::raw::c_int,
-        n_threads: ::std::os::raw::c_int,
     ) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_eval_export(
-        ctx: *mut llama_context,
-        fname: *const ::std::os::raw::c_char,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_batch_get_one(
+        tokens: *mut llama_token,
+        n_tokens: i32,
+        pos_0: llama_pos,
+        seq_id: llama_seq_id,
+    ) -> llama_batch;
 }
 extern "C" {
-    pub fn llama_tokenize(
-        ctx: *mut llama_context,
-        text: *const ::std::os::raw::c_char,
-        tokens: *mut llama_token,
-        n_max_tokens: ::std::os::raw::c_int,
-        add_bos: bool,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_batch_init(n_tokens: i32, embd: i32, n_seq_max: i32) -> llama_batch;
 }
 extern "C" {
-    pub fn llama_tokenize_with_model(
-        model: *const llama_model,
-        text: *const ::std::os::raw::c_char,
-        tokens: *mut llama_token,
-        n_max_tokens: ::std::os::raw::c_int,
-        add_bos: bool,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_batch_free(batch: llama_batch);
 }
 extern "C" {
-    pub fn llama_n_vocab(ctx: *const llama_context) -> ::std::os::raw::c_int;
+    pub fn llama_decode(ctx: *mut llama_context, batch: llama_batch) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_n_ctx(ctx: *const llama_context) -> ::std::os::raw::c_int;
+    pub fn llama_set_n_threads(ctx: *mut llama_context, n_threads: u32, n_threads_batch: u32);
 }
 extern "C" {
-    pub fn llama_n_embd(ctx: *const llama_context) -> ::std::os::raw::c_int;
+    pub fn llama_get_logits(ctx: *mut llama_context) -> *mut f32;
 }
 extern "C" {
-    pub fn llama_n_vocab_from_model(model: *const llama_model) -> ::std::os::raw::c_int;
+    pub fn llama_get_logits_ith(ctx: *mut llama_context, i: i32) -> *mut f32;
 }
 extern "C" {
-    pub fn llama_n_ctx_from_model(model: *const llama_model) -> ::std::os::raw::c_int;
+    pub fn llama_get_embeddings(ctx: *mut llama_context) -> *mut f32;
 }
 extern "C" {
-    pub fn llama_n_embd_from_model(model: *const llama_model) -> ::std::os::raw::c_int;
+    pub fn llama_token_get_text(
+        model: *const llama_model,
+        token: llama_token,
+    ) -> *const ::std::os::raw::c_char;
 }
 extern "C" {
-    pub fn llama_get_vocab(
-        ctx: *const llama_context,
-        strings: *mut *const ::std::os::raw::c_char,
-        scores: *mut f32,
-        capacity: ::std::os::raw::c_int,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_token_get_score(model: *const llama_model, token: llama_token) -> f32;
 }
 extern "C" {
-    pub fn llama_get_vocab_from_model(
-        model: *const llama_model,
-        strings: *mut *const ::std::os::raw::c_char,
-        scores: *mut f32,
-        capacity: ::std::os::raw::c_int,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_token_get_type(model: *const llama_model, token: llama_token) -> llama_token_type;
 }
 extern "C" {
-    pub fn llama_get_logits(ctx: *mut llama_context) -> *mut f32;
+    pub fn llama_token_bos(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_get_embeddings(ctx: *mut llama_context) -> *mut f32;
+    pub fn llama_token_eos(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_token_to_str(
-        ctx: *const llama_context,
-        token: llama_token,
-    ) -> *const ::std::os::raw::c_char;
+    pub fn llama_token_nl(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_token_to_str_with_model(
-        model: *const llama_model,
-        token: llama_token,
-    ) -> *const ::std::os::raw::c_char;
+    pub fn llama_add_bos_token(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_add_eos_token(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_token_prefix(model: *const llama_model) -> llama_token;
+}
+extern "C" {
+    pub fn llama_token_middle(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_token_bos() -> llama_token;
+    pub fn llama_token_suffix(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_token_eos() -> llama_token;
+    pub fn llama_token_eot(model: *const llama_model) -> llama_token;
+}
+extern "C" {
+    #[doc = " @details Convert the provided text into tokens.\n @param tokens The tokens pointer must be large enough to hold the resulting tokens.\n @return Returns the number of tokens on success, no more than n_max_tokens\n @return Returns a negative number on failure - the number of tokens that would have been returned\n @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.\n                Does not insert a leading space."]
+    pub fn llama_tokenize(
+        model: *const llama_model,
+        text: *const ::std::os::raw::c_char,
+        text_len: ::std::os::raw::c_int,
+        tokens: *mut llama_token,
+        n_max_tokens: ::std::os::raw::c_int,
+        add_bos: bool,
+        special: bool,
+    ) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_token_nl() -> llama_token;
+    pub fn llama_token_to_piece(
+        model: *const llama_model,
+        token: llama_token,
+        buf: *mut ::std::os::raw::c_char,
+        length: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
 }
 extern "C" {
     pub fn llama_grammar_init(
@@ -4171,24 +6705,21 @@ extern "C" {
     pub fn llama_grammar_free(grammar: *mut llama_grammar);
 }
 extern "C" {
-    #[doc = " @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix."]
-    pub fn llama_sample_repetition_penalty(
-        ctx: *mut llama_context,
-        candidates: *mut llama_token_data_array,
-        last_tokens: *const llama_token,
-        last_tokens_size: usize,
-        penalty: f32,
-    );
+    pub fn llama_grammar_copy(grammar: *const llama_grammar) -> *mut llama_grammar;
 }
 extern "C" {
-    #[doc = " @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details."]
-    pub fn llama_sample_frequency_and_presence_penalties(
+    pub fn llama_set_rng_seed(ctx: *mut llama_context, seed: u32);
+}
+extern "C" {
+    #[doc = " @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.\n @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details."]
+    pub fn llama_sample_repetition_penalties(
         ctx: *mut llama_context,
         candidates: *mut llama_token_data_array,
         last_tokens: *const llama_token,
-        last_tokens_size: usize,
-        alpha_frequency: f32,
-        alpha_presence: f32,
+        penalty_last_n: usize,
+        penalty_repeat: f32,
+        penalty_freq: f32,
+        penalty_present: f32,
     );
 }
 extern "C" {
@@ -4222,6 +6753,15 @@ extern "C" {
         min_keep: usize,
     );
 }
+extern "C" {
+    #[doc = " @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841"]
+    pub fn llama_sample_min_p(
+        ctx: *mut llama_context,
+        candidates: *mut llama_token_data_array,
+        p: f32,
+        min_keep: usize,
+    );
+}
 extern "C" {
     #[doc = " @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/."]
     pub fn llama_sample_tail_free(
@@ -4240,6 +6780,13 @@ extern "C" {
         min_keep: usize,
     );
 }
+extern "C" {
+    pub fn llama_sample_temp(
+        ctx: *mut llama_context,
+        candidates: *mut llama_token_data_array,
+        temp: f32,
+    );
+}
 extern "C" {
     pub fn llama_sample_temperature(
         ctx: *mut llama_context,
@@ -4277,7 +6824,7 @@ extern "C" {
     ) -> llama_token;
 }
 extern "C" {
-    #[doc = " @details Selects the token with the highest probability."]
+    #[doc = " @details Selects the token with the highest probability.\n          Does not compute the token probabilities. Use llama_sample_softmax() instead."]
     pub fn llama_sample_token_greedy(
         ctx: *mut llama_context,
         candidates: *mut llama_token_data_array,
@@ -4298,6 +6845,146 @@ extern "C" {
         token: llama_token,
     );
 }
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_beam_view {
+    pub tokens: *const llama_token,
+    pub n_tokens: usize,
+    pub p: f32,
+    pub eob: bool,
+}
+#[test]
+fn bindgen_test_layout_llama_beam_view() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_beam_view> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_beam_view>(),
+        24usize,
+        concat!("Size of: ", stringify!(llama_beam_view))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_beam_view>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_beam_view))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).tokens) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beam_view),
+            "::",
+            stringify!(tokens)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_tokens) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beam_view),
+            "::",
+            stringify!(n_tokens)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).p) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beam_view),
+            "::",
+            stringify!(p)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).eob) as usize - ptr as usize },
+        20usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beam_view),
+            "::",
+            stringify!(eob)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_beams_state {
+    pub beam_views: *mut llama_beam_view,
+    pub n_beams: usize,
+    pub common_prefix_length: usize,
+    pub last_call: bool,
+}
+#[test]
+fn bindgen_test_layout_llama_beams_state() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_beams_state> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_beams_state>(),
+        32usize,
+        concat!("Size of: ", stringify!(llama_beams_state))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_beams_state>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_beams_state))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).beam_views) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beams_state),
+            "::",
+            stringify!(beam_views)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_beams) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beams_state),
+            "::",
+            stringify!(n_beams)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).common_prefix_length) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beams_state),
+            "::",
+            stringify!(common_prefix_length)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).last_call) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beams_state),
+            "::",
+            stringify!(last_call)
+        )
+    );
+}
+pub type llama_beam_search_callback_fn_t = ::std::option::Option<
+    unsafe extern "C" fn(callback_data: *mut ::std::os::raw::c_void, arg1: llama_beams_state),
+>;
+extern "C" {
+    #[doc = " @details Deterministically returns entire sentence constructed by a beam search.\n @param ctx Pointer to the llama_context.\n @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.\n @param callback_data A pointer that is simply passed back to callback.\n @param n_beams Number of beams to use.\n @param n_past Number of tokens already evaluated.\n @param n_predict Maximum number of tokens to predict. EOS may occur earlier."]
+    pub fn llama_beam_search(
+        ctx: *mut llama_context,
+        callback: llama_beam_search_callback_fn_t,
+        callback_data: *mut ::std::os::raw::c_void,
+        n_beams: usize,
+        n_past: ::std::os::raw::c_int,
+        n_predict: ::std::os::raw::c_int,
+    );
+}
 extern "C" {
     pub fn llama_get_timings(ctx: *mut llama_context) -> llama_timings;
 }
@@ -4310,3 +6997,78 @@ extern "C" {
 extern "C" {
     pub fn llama_print_system_info() -> *const ::std::os::raw::c_char;
 }
+extern "C" {
+    pub fn llama_log_set(log_callback: ggml_log_callback, user_data: *mut ::std::os::raw::c_void);
+}
+extern "C" {
+    pub fn llama_dump_timing_info_yaml(stream: *mut FILE, ctx: *const llama_context);
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct ggml_backend_buffer {
+    pub _address: u8,
+}
+pub type __builtin_va_list = [__va_list_tag; 1usize];
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct __va_list_tag {
+    pub gp_offset: ::std::os::raw::c_uint,
+    pub fp_offset: ::std::os::raw::c_uint,
+    pub overflow_arg_area: *mut ::std::os::raw::c_void,
+    pub reg_save_area: *mut ::std::os::raw::c_void,
+}
+#[test]
+fn bindgen_test_layout___va_list_tag() {
+    const UNINIT: ::std::mem::MaybeUninit<__va_list_tag> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<__va_list_tag>(),
+        24usize,
+        concat!("Size of: ", stringify!(__va_list_tag))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<__va_list_tag>(),
+        8usize,
+        concat!("Alignment of ", stringify!(__va_list_tag))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).gp_offset) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__va_list_tag),
+            "::",
+            stringify!(gp_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).fp_offset) as usize - ptr as usize },
+        4usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__va_list_tag),
+            "::",
+            stringify!(fp_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).overflow_arg_area) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__va_list_tag),
+            "::",
+            stringify!(overflow_arg_area)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).reg_save_area) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__va_list_tag),
+            "::",
+            stringify!(reg_save_area)
+        )
+    );
+}
diff --git a/crates/llm-chain-llama/examples/alpaca.rs b/crates/llm-chain-llama/examples/alpaca.rs
index 6e581976..3b92d0f6 100644
--- a/crates/llm-chain-llama/examples/alpaca.rs
+++ b/crates/llm-chain-llama/examples/alpaca.rs
@@ -1,7 +1,7 @@
 use llm_chain::executor;
 use llm_chain::{parameters, prompt};
 
-#[tokio::main(flavor = "current_thread")]
+#[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let exec = executor!(llama)?;
     let res = prompt!("Write a hypothetical weather report for {season} in {location}.")
diff --git a/crates/llm-chain-llama/examples/few_shot.rs b/crates/llm-chain-llama/examples/few_shot.rs
index e3f3e389..dcc09d22 100644
--- a/crates/llm-chain-llama/examples/few_shot.rs
+++ b/crates/llm-chain-llama/examples/few_shot.rs
@@ -1,4 +1,3 @@
-use llm_chain::options;
 use llm_chain::prompt::Conversation;
 use llm_chain::{chains::conversation::Chain, executor, parameters, prompt, step::Step};
 /// This example demonstrates how to use the llm-chain for few-shot prompting
@@ -10,12 +9,7 @@ use llm_chain::{chains::conversation::Chain, executor, parameters, prompt, step:
 /// Make sure to have the env var 'LLM_CHAIN_MODEL' set
 #[tokio::main(flavor = "multi_thread", worker_threads = 1)]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let opts = options!(
-        NThreads: 4_usize,
-        StopSequence: vec!["\n".to_string()]
-    );
-
-    let exec_1 = executor!(llama, opts.clone())?;
+    let exec_1 = executor!(llama)?;
 
     let user_prompt =
         "Take the last letters of the words in '{{ full_name }}' and concatenate them";
@@ -47,7 +41,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // Define the step
     let step = Step::for_prompt_template(prompt!(user: user_prompt));
     // Execute the chain.
-    let exec_2 = executor!(llama, opts)?;
+    let exec_2 = executor!(llama)?;
     let res = chain
         .send_message(step, &parameters!().with("full_name", "Elon Musk"), &exec_2)
         .await?;
diff --git a/crates/llm-chain-llama/examples/map_reduce_llama.rs b/crates/llm-chain-llama/examples/map_reduce_llama.rs
index 884304da..87102cff 100644
--- a/crates/llm-chain-llama/examples/map_reduce_llama.rs
+++ b/crates/llm-chain-llama/examples/map_reduce_llama.rs
@@ -1,14 +1,35 @@
 use llm_chain::chains::map_reduce::Chain;
 use llm_chain::executor;
+use llm_chain::options;
 use llm_chain::{prompt, step::Step, Parameters};
 
-#[tokio::main(flavor = "current_thread")]
+#[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let exec = executor!(llama)?;
+    let opts = options!(
+        ModelType: "llama",
+        MaxContextSize: 4096_usize,
+        NThreads: 4_usize,
+        MaxTokens: 2048_usize,
+        MaxBatchSize: 4096_usize,
+        TopK: 40_i32,
+        TopP: 0.95,
+        TfsZ: 1.0,
+        TypicalP: 1.0,
+        Temperature: 0.8,
+        RepeatPenalty: 1.1,
+        RepeatPenaltyLastN: 64_usize,
+        FrequencyPenalty: 0.0,
+        PresencePenalty: 0.0,
+        Mirostat: 0_i32,
+        MirostatTau: 5.0,
+        MirostatEta: 0.1,
+        PenalizeNl: true,
+        StopSequence: vec!["\n\n".to_string()]
+    );
+    let exec = executor!(llama, opts.clone())?;
     let map_prompt = Step::for_prompt_template(prompt!("== ARTICLE ==\n{{text}}== SUMMARY ==\n"));
     let reduce_prompt =
         Step::for_prompt_template(prompt!("== ARTICLE ==\n{{text}}== FINAL SUMMARY ==\n"));
-
     let chain = Chain::new(map_prompt, reduce_prompt);
     let article = include_str!("article_to_summarize.md");
     let docs = vec![Parameters::new_with_text(article)];
diff --git a/crates/llm-chain-llama/examples/simple_llama.rs b/crates/llm-chain-llama/examples/simple_llama.rs
index 079c0acb..28cdd6cf 100644
--- a/crates/llm-chain-llama/examples/simple_llama.rs
+++ b/crates/llm-chain-llama/examples/simple_llama.rs
@@ -30,7 +30,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         ModelType: "llama",
         MaxContextSize: 512_usize,
         NThreads: 4_usize,
-        MaxTokens: 0_usize,
+        MaxTokens: 512_usize,
         TopK: 40_i32,
         TopP: 0.95,
         TfsZ: 1.0,
diff --git a/crates/llm-chain-llama/examples/stream.rs b/crates/llm-chain-llama/examples/stream.rs
index 0274e730..906f16b3 100644
--- a/crates/llm-chain-llama/examples/stream.rs
+++ b/crates/llm-chain-llama/examples/stream.rs
@@ -4,10 +4,9 @@ use llm_chain::{executor, parameters, prompt};
 /// This example demonstrates how to use the llm-chain-llama crate to generate streaming text using a
 /// LLaMA model.
 ///
-/// Usage: cargo run --example simple path/to/llama-or-alpaca-model
+/// Usage: cargo run --example stream
 ///
-/// For example, if the model is located at "/models/llama"
-/// cargo run --example simple /models/llama
+/// Make sure to have the env var 'LLM_CHAIN_MODEL' set.
 #[tokio::main(flavor = "current_thread")]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let exec = executor!(llama)?;
diff --git a/crates/llm-chain-llama/src/batch.rs b/crates/llm-chain-llama/src/batch.rs
new file mode 100644
index 00000000..17af1c73
--- /dev/null
+++ b/crates/llm-chain-llama/src/batch.rs
@@ -0,0 +1,118 @@
+use llm_chain_llama_sys::{llama_batch, llama_batch_free, llama_seq_id};
+use std::ptr::null_mut;
+
+#[derive(Debug, Clone)]
+#[allow(dead_code)]
+pub struct LlamaBatch {
+    n_tokens: i32,
+    token: Vec<i32>,
+    embd: Vec<f32>,
+    pos: Vec<i32>,
+    n_seq_id: Vec<i32>,
+    seq_id: Vec<Vec<i32>>,
+    logits: Vec<bool>,
+    all_pos_0: i32,
+    all_pos_1: i32,
+    all_seq_id: i32,
+}
+
+impl LlamaBatch {
+    pub fn new_with_tokens(tokens: Vec<i32>, max_seq: i32) -> Self {
+        let pos = (0..tokens.len()).map(|p| p as i32).collect::<Vec<_>>();
+        let embd = vec![];
+        let n_seq_id = vec![max_seq; tokens.len()];
+        let seq_id = vec![vec![0; tokens.len()]; tokens.len()];
+        let logits = vec![false; tokens.len()];
+        let all_pos_0 = 0;
+        let all_pos_1 = 0;
+        let all_seq_id = 0;
+
+        Self {
+            n_tokens: tokens.len() as i32,
+            token: tokens,
+            embd,
+            pos,
+            n_seq_id,
+            seq_id,
+            logits,
+            all_pos_0,
+            all_pos_1,
+            all_seq_id,
+        }
+    }
+
+    pub fn new_with_token(token: i32, pos: i32) -> Self {
+        Self {
+            n_tokens: 1,
+            token: vec![token],
+            embd: vec![],
+            pos: vec![pos],
+            n_seq_id: vec![1],
+            seq_id: vec![vec![0]],
+            logits: vec![true],
+            all_pos_0: 0,
+            all_pos_1: 0,
+            all_seq_id: 0,
+        }
+    }
+
+    pub fn token_count(&self) -> usize {
+        self.n_tokens as usize
+    }
+
+    pub fn enable_logits(&mut self, pos: usize) {
+        self.logits[pos] = true;
+    }
+}
+
+impl Drop for LlamaBatch {
+    fn drop(&mut self) {
+        unsafe {
+            llama_batch_free(self.into());
+        }
+    }
+}
+
+fn convert_llama_batch(batch: &LlamaBatch) -> llama_batch {
+    let n_tokens = batch.n_tokens;
+    let token_ptr = Box::leak(batch.token.clone().into_boxed_slice()).as_mut_ptr();
+    let embd_ptr = if batch.embd.is_empty() {
+        null_mut()
+    } else {
+        Box::leak(batch.embd.clone().into_boxed_slice()).as_mut_ptr()
+    };
+    let pos_ptr = Box::leak(batch.pos.clone().into_boxed_slice()).as_mut_ptr();
+    let n_seq_id_ptr = Box::leak(batch.n_seq_id.clone().into_boxed_slice()).as_mut_ptr();
+    let raw_pointers = batch
+        .seq_id
+        .clone()
+        .into_iter()
+        .map(|inner_vec| Box::leak(inner_vec.into_boxed_slice()).as_mut_ptr())
+        .collect::<Vec<*mut llama_seq_id>>();
+    let seq_id_ptr = Box::leak(raw_pointers.into_boxed_slice()).as_mut_ptr();
+    let logits_ptr = Box::leak(batch.logits.clone().into_boxed_slice()).as_mut_ptr();
+    llama_batch {
+        n_tokens,
+        token: token_ptr,
+        embd: embd_ptr,
+        pos: pos_ptr,
+        n_seq_id: n_seq_id_ptr,
+        seq_id: seq_id_ptr,
+        logits: logits_ptr as *mut i8,
+        all_pos_0: batch.all_pos_0,
+        all_pos_1: batch.all_pos_1,
+        all_seq_id: batch.all_seq_id,
+    }
+}
+
+impl From<&LlamaBatch> for llama_batch {
+    fn from(batch: &LlamaBatch) -> Self {
+        convert_llama_batch(batch)
+    }
+}
+
+impl From<&mut LlamaBatch> for llama_batch {
+    fn from(batch: &mut LlamaBatch) -> Self {
+        convert_llama_batch(batch)
+    }
+}
diff --git a/crates/llm-chain-llama/src/context.rs b/crates/llm-chain-llama/src/context.rs
index cbae028f..a84894ce 100644
--- a/crates/llm-chain-llama/src/context.rs
+++ b/crates/llm-chain-llama/src/context.rs
@@ -1,20 +1,22 @@
-use std::{
-    ffi::{CStr, CString},
-    ptr::null_mut,
-};
+use std::ffi::{CStr, CString};
 
+use crate::batch;
+use crate::model::ModelParams;
 use crate::options::LlamaInvocation;
 use anyhow::Result;
 use llm_chain_llama_sys::{
-    llama_context, llama_context_default_params, llama_context_params, llama_eval, llama_free,
-    llama_get_logits, llama_init_from_file, llama_n_vocab,
-    llama_sample_frequency_and_presence_penalties, llama_sample_repetition_penalty,
+    llama_context, llama_context_default_params, llama_context_params, llama_decode, llama_eval,
+    llama_free, llama_get_logits, llama_get_logits_ith, llama_load_model_from_file, llama_model,
+    llama_n_vocab, llama_new_context_with_model, llama_sample_repetition_penalties,
     llama_sample_tail_free, llama_sample_temperature, llama_sample_token,
     llama_sample_token_greedy, llama_sample_token_mirostat, llama_sample_token_mirostat_v2,
     llama_sample_top_k, llama_sample_top_p, llama_sample_typical, llama_token_data,
-    llama_token_data_array, llama_token_nl, llama_token_to_str,
+    llama_token_data_array, llama_token_eos, llama_token_get_text, llama_token_nl,
+    llama_token_to_piece,
 };
 
+pub use batch::LlamaBatch;
+
 #[derive(Debug, thiserror::Error)]
 #[error("LLAMA.cpp returned error-code {0}")]
 pub struct LLAMACPPErrorCode(i32);
@@ -22,13 +24,22 @@ pub struct LLAMACPPErrorCode(i32);
 // Represents the configuration parameters for a LLamaContext.
 #[derive(Debug, Clone)]
 pub struct ContextParams {
-    pub n_parts: i32,
-    pub n_ctx: i32,
-    pub seed: i32,
+    pub seed: u32,
+    pub n_ctx: u32,
+    pub n_batch: u32,
+    pub n_threads: u32,
+    pub n_threads_batch: u32,
+    pub rope_scaling_type: i8,
+    pub rope_freq_base: f32,
+    pub rope_freq_scale: f32,
+    pub yarn_ext_factor: f32,
+    pub yarn_attn_factor: f32,
+    pub yarn_beta_fast: f32,
+    pub yarn_beta_slow: f32,
+    pub yarn_orig_ctx: u32,
+    pub mul_mat_q: bool,
     pub f16_kv: bool,
-    pub vocab_only: bool,
-    pub use_mlock: bool,
-    pub use_mmap: bool,
+    pub logits_all: bool,
     pub embedding: bool,
 }
 
@@ -57,17 +68,23 @@ impl Default for ContextParams {
 impl From<ContextParams> for llama_context_params {
     fn from(params: ContextParams) -> Self {
         llama_context_params {
-            n_parts: params.n_parts,
-            n_ctx: params.n_ctx,
             seed: params.seed,
+            n_ctx: params.n_ctx,
+            n_batch: params.n_batch,
+            n_threads: params.n_threads,
+            n_threads_batch: params.n_threads_batch,
+            rope_scaling_type: params.rope_scaling_type,
+            rope_freq_base: params.rope_freq_base,
+            rope_freq_scale: params.rope_freq_scale,
+            yarn_ext_factor: params.yarn_ext_factor,
+            yarn_attn_factor: params.yarn_attn_factor,
+            yarn_beta_fast: params.yarn_beta_fast,
+            yarn_beta_slow: params.yarn_beta_slow,
+            yarn_orig_ctx: params.yarn_orig_ctx,
+            mul_mat_q: params.mul_mat_q,
             f16_kv: params.f16_kv,
             logits_all: false,
-            vocab_only: params.vocab_only,
-            use_mlock: params.use_mlock,
-            use_mmap: params.use_mmap,
             embedding: params.embedding,
-            progress_callback: None,
-            progress_callback_user_data: null_mut(),
         }
     }
 }
@@ -75,13 +92,22 @@ impl From<ContextParams> for llama_context_params {
 impl From<llama_context_params> for ContextParams {
     fn from(params: llama_context_params) -> Self {
         ContextParams {
-            n_ctx: params.n_ctx,
-            n_parts: params.n_parts,
             seed: params.seed,
+            n_ctx: params.n_ctx,
+            n_batch: params.n_batch,
+            n_threads: params.n_threads,
+            n_threads_batch: params.n_threads_batch,
+            rope_scaling_type: params.rope_scaling_type,
+            rope_freq_base: params.rope_freq_base,
+            rope_freq_scale: params.rope_freq_scale,
+            yarn_ext_factor: params.yarn_ext_factor,
+            yarn_attn_factor: params.yarn_attn_factor,
+            yarn_beta_fast: params.yarn_beta_fast,
+            yarn_beta_slow: params.yarn_beta_slow,
+            yarn_orig_ctx: params.yarn_orig_ctx,
+            mul_mat_q: params.mul_mat_q,
             f16_kv: params.f16_kv,
-            vocab_only: params.vocab_only,
-            use_mlock: params.use_mlock,
-            use_mmap: params.use_mmap,
+            logits_all: params.logits_all,
             embedding: params.embedding,
         }
     }
@@ -90,21 +116,31 @@ impl From<llama_context_params> for ContextParams {
 // Represents the LLamaContext which wraps FFI calls to the llama.cpp library.
 pub(crate) struct LLamaContext {
     ctx: *mut llama_context,
+    pub model: *mut llama_model,
 }
 
+#[allow(dead_code)]
 impl LLamaContext {
     // Creates a new LLamaContext from the specified file and configuration parameters.
     pub fn from_file_and_params(
         path: &str,
-        params: Option<&ContextParams>,
+        model_params: Option<&ModelParams>,
+        context_params: Option<&ContextParams>,
     ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
         let path = CString::new(path).expect("could not convert to CString");
-        let params = ContextParams::or_default(params);
-        let ctx = unsafe { llama_init_from_file(path.into_raw() as *const i8, params) };
+        let model_params = ModelParams::or_default(model_params);
+        let model =
+            unsafe { llama_load_model_from_file(path.into_raw() as *const i8, model_params) };
+        if model.is_null() {
+            return Err("Initializing llama model returned nullptr".into());
+        }
+
+        let context_params = ContextParams::or_default(context_params);
+        let ctx = unsafe { llama_new_context_with_model(model, context_params) };
         if ctx.is_null() {
             return Err("Initializing llama context returned nullptr".into());
         }
-        Ok(Self { ctx })
+        Ok(Self { ctx, model })
     }
 
     // Token logits obtained from the last call to llama_eval()
@@ -117,7 +153,12 @@ impl LLamaContext {
         unsafe { std::slice::from_raw_parts_mut(llama_get_logits(self.ctx), len) }.to_vec()
     }
     pub fn llama_n_vocab(&self) -> i32 {
-        unsafe { llama_n_vocab(self.ctx) }
+        unsafe { llama_n_vocab(self.model) }
+    }
+
+    pub fn llama_get_logits_ith(&self, index: usize) -> Vec<f32> {
+        let float_ptr = unsafe { llama_get_logits_ith(self.ctx, index as i32) };
+        Vec::from(unsafe { std::slice::from_raw_parts(float_ptr, self.llama_n_vocab() as usize) })
     }
 
     // Executes the LLama sampling process with the specified configuration.
@@ -127,6 +168,7 @@ impl LLamaContext {
         last_n_tokens_data: &[i32],
         last_n_tokens_size: i32,
         input: &LlamaInvocation,
+        batch_n_tokens: i32,
     ) -> i32 {
         let top_k = if input.top_k <= 0 {
             self.llama_n_vocab()
@@ -140,7 +182,7 @@ impl LLamaContext {
         };
         let n_vocab = self.llama_n_vocab() as usize;
         // only get the last row, as the sample only requires this.
-        let mut logits = self.llama_get_logits_as_slice(1, n_vocab);
+        let mut logits = self.llama_get_logits_ith((batch_n_tokens - 1) as usize);
 
         // let id : llama_token = 0;
         input
@@ -160,11 +202,11 @@ impl LLamaContext {
             size: candidates.len(),
             sorted: false,
         };
-        let nl_logit = logits[unsafe { llama_token_nl() } as usize];
+        let nl_logit = logits[unsafe { llama_token_nl(self.model) } as usize];
         let last_n_repeat = i32::min(i32::min(last_n_tokens_size, repeat_last_n), n_ctx) as usize;
 
         unsafe {
-            llama_sample_repetition_penalty(
+            llama_sample_repetition_penalties(
                 self.ctx,
                 &mut candidates_p,
                 last_n_tokens_data
@@ -172,22 +214,12 @@ impl LLamaContext {
                     .add((last_n_tokens_size - last_n_repeat as i32) as usize),
                 last_n_repeat,
                 input.repeat_penalty,
-            )
-        };
-        unsafe {
-            llama_sample_frequency_and_presence_penalties(
-                self.ctx,
-                &mut candidates_p,
-                last_n_tokens_data
-                    .as_ptr()
-                    .add((last_n_tokens_size - last_n_repeat as i32) as usize),
-                last_n_repeat,
                 input.frequency_penalty,
                 input.presence_penalty,
             )
         };
         if !input.penalize_nl {
-            logits[unsafe { llama_token_nl() as usize }] = nl_logit;
+            logits[unsafe { llama_token_nl(self.model) as usize }] = nl_logit;
         }
 
         if input.temp <= 0.0 {
@@ -231,26 +263,74 @@ impl LLamaContext {
     }
 
     pub fn llama_token_to_bytes(&self, token: &i32) -> Vec<u8> {
-        let c_ptr = unsafe { llama_token_to_str(self.ctx, *token) };
+        let c_ptr = unsafe { llama_token_get_text(self.model, *token) };
         unsafe { CStr::from_ptr(c_ptr) }.to_bytes().to_vec()
     }
 
     // Evaluates the given tokens with the specified configuration.
     pub fn llama_eval(
         &self,
-        tokens: &[i32],
+        tokens: &mut [i32],
         n_tokens: i32,
         n_past: i32,
-        input: &LlamaInvocation,
+        _input: &LlamaInvocation,
     ) -> Result<(), LLAMACPPErrorCode> {
-        let res =
-            unsafe { llama_eval(self.ctx, tokens.as_ptr(), n_tokens, n_past, input.n_threads) };
+        let res = unsafe { llama_eval(self.ctx, tokens.as_mut_ptr(), n_tokens, n_past) };
         if res == 0 {
             Ok(())
         } else {
             Err(LLAMACPPErrorCode(res))
         }
     }
+
+    // Evaluates the provided batch.
+    pub fn llama_decode(&self, batch: &LlamaBatch) -> Result<(), LLAMACPPErrorCode> {
+        let res = unsafe { llama_decode(self.ctx, batch.into()) };
+        if res == 0 {
+            Ok(())
+        } else {
+            Err(LLAMACPPErrorCode(res))
+        }
+    }
+
+    pub fn llama_token_eos(&self) -> i32 {
+        unsafe { llama_token_eos(self.model) }
+    }
+
+    pub fn llama_token_nl(&self) -> i32 {
+        unsafe { llama_token_nl(self.model) }
+    }
+
+    pub fn llama_token_to_piece(
+        &self,
+        token_id: i32,
+    ) -> Result<String, std::string::FromUtf8Error> {
+        let mut result = vec![0 as i8; 8];
+        let n_tokens = unsafe {
+            llama_token_to_piece(
+                self.model,
+                token_id,
+                result.as_mut_ptr(),
+                result.len() as i32,
+            )
+        };
+        if n_tokens < 0 {
+            result.resize(-n_tokens as usize, 0 as i8);
+            let check = unsafe {
+                llama_token_to_piece(
+                    self.model,
+                    token_id,
+                    result.as_mut_ptr(),
+                    result.len() as i32,
+                )
+            };
+            assert_eq!(check, -n_tokens);
+        } else {
+            result.resize(n_tokens as usize, 0 as i8);
+        }
+        let result_bytes: Vec<u8> = result.into_iter().map(|b| b as u8).collect();
+        String::from_utf8(result_bytes)
+    }
 }
 
 // Provides thread-safe behavior for LLamaContext.
diff --git a/crates/llm-chain-llama/src/executor.rs b/crates/llm-chain-llama/src/executor.rs
index e5118792..174580e6 100644
--- a/crates/llm-chain-llama/src/executor.rs
+++ b/crates/llm-chain-llama/src/executor.rs
@@ -1,9 +1,9 @@
 use std::marker::PhantomData;
 use std::sync::Arc;
 
-use crate::context::{ContextParams, LLamaContext};
+use crate::context::{ContextParams, LLamaContext, LlamaBatch};
 use crate::options::{get_executor_initial_opts, LlamaInvocation, DEFAULT_OPTIONS};
-use crate::tokenizer::{embedding_to_output, llama_token_eos, tokenize, tokens_to_string};
+use crate::tokenizer::{embedding_to_output, tokenize};
 
 use async_trait::async_trait;
 
@@ -61,6 +61,7 @@ impl Executor {
         tokio::task::spawn_blocking(move || {
             let context_size = context_size;
             let context = context.blocking_lock();
+
             let tokenized_stop_prompt = tokenize(
                 &context,
                 input
@@ -69,6 +70,7 @@ impl Executor {
                     .map(|x| x.as_str())
                     .unwrap_or("\n\n"),
                 false,
+                true,
             );
 
             if tokenized_stop_prompt.len() > context_size {
@@ -77,68 +79,78 @@ impl Executor {
             }
 
             let prompt_text = input.prompt.to_text();
-            let tokenized_input = tokenize(&context, prompt_text.as_str(), true);
+
+            let tokenized_input = tokenize(&context, prompt_text.as_str(), true, false);
             if tokenized_input.len() > context_size {
                 must_send!(sender, StreamSegment::Err(ExecutorError::ContextTooSmall));
                 return;
             }
 
-            // Embd contains the prompt and the completion. The longer the prompt, the shorter the completion.
+            // embd contains the prompt and the completion. The longer the
+            // prompt, the shorter the completion.
+            // It will initially contain a copy of the tokenized prompt and then
+            // may be extended with the tokenized answer prefix. After each
+            // sampling the sampled token will also be added to this vector.
+            // This is done so that the sampling function has access to all the
+            // tokens which it may need for repetition penalties, etc.
             let mut embd = tokenized_input.clone();
 
-            // Evaluate the prompt in full.
+            let mut batch = LlamaBatch::new_with_tokens(tokenized_input.clone(), 1);
+            let last_idx = (batch.token_count() - 1) as usize;
+            batch.enable_logits(last_idx);
+
             bail!(
                 context
-                    .llama_eval(
-                        tokenized_input.as_slice(),
-                        tokenized_input.len() as i32,
-                        0,
-                        &input,
-                    )
+                    .llama_decode(&batch)
                     .map_err(|e| ExecutorError::InnerError(e.into())),
                 sender
             );
+            let mut n_cur = batch.token_count();
+            let mut n_used = (batch.token_count() - 1) as usize;
 
             let mut n_remaining = context_size - tokenized_input.len();
-            let mut n_used = tokenized_input.len() - 1;
             if let Some(prefix) = answer_prefix {
-                let tokenized_answer_prefix = tokenize(&context, prefix.as_str(), false);
+                let tokenized_answer_prefix = tokenize(&context, prefix.as_str(), true, true);
                 if tokenized_answer_prefix.len() > context_size {
                     must_send!(sender, StreamSegment::Err(ExecutorError::ContextTooSmall));
                     return;
                 }
-
+                let batch = LlamaBatch::new_with_tokens(tokenized_answer_prefix.clone(), 1);
                 // Evaluate the answer prefix (the role -- should be Assistant: )
                 bail!(
                     context
-                        .llama_eval(
-                            tokenized_answer_prefix.as_slice(),
-                            tokenized_answer_prefix.len() as i32,
-                            n_used as i32,
-                            &input,
-                        )
+                        .llama_decode(&batch)
                         .map_err(|e| ExecutorError::InnerError(e.into())),
                     sender
                 );
                 n_remaining -= tokenized_answer_prefix.len();
-                n_used += tokenized_answer_prefix.len();
                 embd.extend(tokenized_answer_prefix);
+                n_cur += batch.token_count();
+                n_used += (batch.token_count() - 1) as usize;
             }
             embd.resize(context_size, 0);
-            let token_eos = llama_token_eos();
+            let token_eos = context.llama_token_eos();
+
             let mut stop_sequence_i = 0;
+            let mut n_batch = batch.token_count();
+            let mut n_samples = 0;
+            let ignore_initial_nls = input.prompt.to_text().ends_with('?');
+            let nl_token = context.llama_token_nl();
+
             // Generate remaining tokens.
-            let mut leftover_bytes: Vec<u8> = vec![];
             while n_remaining > 0 {
                 let tok = context.llama_sample(
                     context_size as i32,
                     embd.as_slice(),
                     n_used as i32,
                     &input,
+                    n_batch as i32,
                 );
+                n_samples += 1;
                 n_used += 1;
                 n_remaining -= 1;
                 embd[n_used] = tok;
+
                 if tok == token_eos {
                     break;
                 }
@@ -147,47 +159,43 @@ impl Executor {
                 {
                     break;
                 }
+
+                // If the input prompt is in the form of a question then the next
+                // predicted token will be a newline to finish off the question
+                // itself, followed by another newline before the actual
+                // answer. The check below skips those initial newlines.
+                if n_samples <= 2 && ignore_initial_nls && tok == nl_token {
+                    continue;
+                }
+
                 if tok == tokenized_stop_prompt[stop_sequence_i] {
                     stop_sequence_i += 1;
                     if stop_sequence_i >= tokenized_stop_prompt.len() {
                         break;
                     }
                 } else {
-                    let str_output =
-                        tokens_to_string(&context, &embd[n_used - stop_sequence_i..n_used]);
-                    // XXX: make into chat if chat
-                    must_send!(sender, StreamSegment::Content(str_output));
+                    let piece = bail!(
+                        context
+                            .llama_token_to_piece(tok)
+                            .map_err(|e| ExecutorError::InnerError(e.into())),
+                        sender
+                    );
+                    must_send!(sender, StreamSegment::Content(piece));
                     stop_sequence_i = 0;
-                }
-                bail!(
-                    context
-                        .llama_eval(&embd[n_used..], 1, n_used as i32, &input)
-                        .map_err(|e| ExecutorError::InnerError(e.into())),
-                    sender
-                );
 
-                if n_used >= tokenized_input.len() && stop_sequence_i == 0 {
-                    let bytes_output: Vec<u8> =
-                        [leftover_bytes, context.llama_token_to_bytes(&embd[n_used])].concat();
+                    let batch = LlamaBatch::new_with_token(tok, n_cur as i32);
 
-                    let (str_output, leftover) = decode_up_to_valid_utf8(&bytes_output);
-                    leftover_bytes = leftover;
-                    // XXX: make into chat if chat
-                    if sender.send(StreamSegment::Content(str_output)).is_err() {
-                        panic!("Failed to send");
-                    }
+                    n_batch = batch.token_count();
+                    n_cur += 1;
+
+                    bail!(
+                        context
+                            .llama_decode(&batch)
+                            .map_err(|e| ExecutorError::InnerError(e.into())),
+                        sender
+                    );
                 }
             }
-            if sender
-                .send(StreamSegment::Content(
-                    std::char::REPLACEMENT_CHARACTER
-                        .to_string()
-                        .repeat(leftover_bytes.len()),
-                ))
-                .is_err()
-            {
-                panic!("Failed to send");
-            }
         }); //JoinHandle is dropped? not sure how this works
 
         output
@@ -206,10 +214,11 @@ impl ExecutorTrait for Executor {
             .with_options(&opts_from_env)
             .with_options(&options);
 
-        let (model_path, context_params) = get_executor_initial_opts(&cas)?;
+        let (model_path, model_params, context_params) = get_executor_initial_opts(&cas)?;
         Ok(Self {
             context: Arc::new(Mutex::new(LLamaContext::from_file_and_params(
                 &model_path,
+                Some(&model_params),
                 Some(&context_params),
             )?)),
             options,
@@ -233,18 +242,18 @@ impl ExecutorTrait for Executor {
         let mut tokens_used = tokenizer
             .tokenize_str(&input)
             .map_err(|_e| PromptTokensError::UnableToCompute)?
-            .len() as i32;
+            .len();
         // includes answer_prefix
         let answer_prefix = self.answer_prefix(prompt);
         if let Some(prefix) = answer_prefix {
             let answer_used = tokenizer
                 .tokenize_str(&prefix)
                 .map_err(|_e| PromptTokensError::UnableToCompute)?
-                .len() as i32;
+                .len();
             tokens_used += answer_used
         }
         let max_tokens = self.max_tokens_allowed(options);
-        Ok(TokenCount::new(max_tokens, tokens_used))
+        Ok(TokenCount::new(max_tokens, tokens_used as i32))
     }
 
     fn answer_prefix(&self, prompt: &Prompt) -> Option<String> {
@@ -263,7 +272,7 @@ impl ExecutorTrait for Executor {
     }
 
     fn max_tokens_allowed(&self, _step: &Options) -> i32 {
-        self.context_params.n_ctx
+        self.context_params.n_ctx as i32
     }
 
     fn get_tokenizer(&self, _step: &Options) -> Result<LLamaTokenizer, TokenizerError> {
@@ -289,7 +298,7 @@ impl Tokenizer for LLamaTokenizer<'_> {
     fn tokenize_str(&self, doc: &str) -> Result<TokenCollection, TokenizerError> {
         let tokenized = tokio::task::block_in_place(|| {
             let context = self.context.blocking_lock();
-            tokenize(&context, doc, true)
+            tokenize(&context, doc, true, false)
         });
         Ok(tokenized.into())
     }
@@ -303,34 +312,3 @@ impl Tokenizer for LLamaTokenizer<'_> {
         Ok(output.to_string())
     }
 }
-
-fn decode_up_to_valid_utf8(bytes: &[u8]) -> (String, Vec<u8>) {
-    let (str_output, leftover): (String, Vec<u8>) = match std::str::from_utf8(bytes) {
-        Ok(s) => (s.to_owned(), Vec::new()),
-        Err(unicode_err) => {
-            let index = unicode_err.valid_up_to();
-            let good = &bytes[0..index];
-            match unicode_err.error_len() {
-                None => {
-                    let leftover = bytes[index..].to_vec();
-                    let out = std::str::from_utf8(good).unwrap().to_owned();
-                    (out, leftover)
-                }
-                Some(len) => {
-                    //let bad = &bytes[index..index+len];
-                    //eprintln!("bad utf8: {:?}", bad);
-                    let rest = &bytes[index + len..];
-                    let beggining = std::str::from_utf8(good).unwrap().to_owned();
-                    let (after, leftover) = decode_up_to_valid_utf8(rest);
-
-                    let mut out = beggining;
-                    out.push_str(&std::char::REPLACEMENT_CHARACTER.to_string().repeat(len));
-                    out.push_str(&after);
-
-                    (out, leftover)
-                }
-            }
-        }
-    };
-    (str_output, leftover)
-}
diff --git a/crates/llm-chain-llama/src/lib.rs b/crates/llm-chain-llama/src/lib.rs
index 48b39b19..16222a9d 100644
--- a/crates/llm-chain-llama/src/lib.rs
+++ b/crates/llm-chain-llama/src/lib.rs
@@ -21,13 +21,16 @@
 //!
 //! Happy coding, and enjoy the amazing world of LLMs with llm-chain-llama! 🥳🚀
 
+mod batch;
 mod context;
 mod executor;
+mod model;
 mod options;
 mod tokenizer;
 
 pub use context::ContextParams;
 pub use executor::Executor;
+pub use model::ModelParams;
 
 #[deprecated(note = "Use llm_chain::step::Step instead", since = "0.7.0")]
 pub use llm_chain::step::Step;
diff --git a/crates/llm-chain-llama/src/model.rs b/crates/llm-chain-llama/src/model.rs
new file mode 100644
index 00000000..9a6aba8f
--- /dev/null
+++ b/crates/llm-chain-llama/src/model.rs
@@ -0,0 +1,100 @@
+use llm_chain_llama_sys::{llama_model_default_params, llama_model_params, LLAMA_MAX_DEVICES};
+use std::ptr::null_mut;
+
+/// Configuration parameters used when loading a LLama model.
+///
+/// Owned counterpart of the raw `llama_model_params` struct; the two are
+/// converted into each other through the `From` impls below.
+#[derive(Debug, Clone)]
+pub struct ModelParams {
+    pub n_gpu_layers: i32,
+    pub main_gpu: i32,
+    /// Passed to llama.cpp as a raw pointer; an empty vector becomes null.
+    pub tensor_split: Vec<f32>,
+    pub vocab_only: bool,
+    pub use_mmap: bool,
+    pub use_mlock: bool,
+}
+
+impl ModelParams {
+    /// Creates parameters initialised from `llama_model_default_params()`.
+    pub fn new() -> ModelParams {
+        // SAFETY: argument-less FFI call that returns the params struct by value.
+        unsafe { llama_model_default_params() }.into()
+    }
+
+    /// Returns the user-specified parameters in their FFI form, or the
+    /// llama.cpp defaults when `params` is `None`.
+    pub(crate) fn or_default(params: Option<&ModelParams>) -> llama_model_params {
+        match params {
+            Some(params) => params.clone().into(),
+            // SAFETY: argument-less FFI call that returns the params struct by value.
+            None => unsafe { llama_model_default_params() },
+        }
+    }
+}
+
+impl Default for ModelParams {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl From<ModelParams> for llama_model_params {
+    /// Converts the owned parameters into the raw struct expected by llama.cpp.
+    ///
+    /// An empty `tensor_split` is passed as a null pointer. A non-empty one is
+    /// zero-padded to at least `LLAMA_MAX_DEVICES` entries and intentionally
+    /// leaked: the returned struct only stores a raw pointer, and `params` is
+    /// dropped when this function returns, so pointing into its `Vec` would
+    /// leave the pointer dangling.
+    fn from(params: ModelParams) -> Self {
+        let tensor_split: *const f32 = if params.tensor_split.is_empty() {
+            std::ptr::null()
+        } else {
+            let mut split = params.tensor_split;
+            if split.len() < LLAMA_MAX_DEVICES as usize {
+                split.resize(LLAMA_MAX_DEVICES as usize, 0.0);
+            }
+            Box::leak(split.into_boxed_slice()).as_ptr()
+        };
+        llama_model_params {
+            n_gpu_layers: params.n_gpu_layers,
+            main_gpu: params.main_gpu,
+            tensor_split,
+            vocab_only: params.vocab_only,
+            use_mmap: params.use_mmap,
+            use_mlock: params.use_mlock,
+            progress_callback: None,
+            progress_callback_user_data: null_mut(),
+        }
+    }
+}
+
+impl From<llama_model_params> for ModelParams {
+    /// Copies the raw llama.cpp parameters into an owned `ModelParams`.
+    ///
+    /// A null `tensor_split` maps to an empty vector; otherwise
+    /// `LLAMA_MAX_DEVICES` values are copied out of it. The progress callback
+    /// fields are not carried over.
+    fn from(params: llama_model_params) -> Self {
+        let tensor_split = if params.tensor_split.is_null() {
+            Vec::new()
+        } else {
+            // SAFETY: the pointer is non-null; it is assumed (not checked here)
+            // to reference at least `LLAMA_MAX_DEVICES` readable floats.
+            unsafe {
+                std::slice::from_raw_parts(params.tensor_split, LLAMA_MAX_DEVICES as usize)
+            }
+            .to_vec()
+        };
+        ModelParams {
+            n_gpu_layers: params.n_gpu_layers,
+            main_gpu: params.main_gpu,
+            tensor_split,
+            vocab_only: params.vocab_only,
+            use_mmap: params.use_mmap,
+            use_mlock: params.use_mlock,
+        }
+    }
+}
diff --git a/crates/llm-chain-llama/src/options.rs b/crates/llm-chain-llama/src/options.rs
index 37b62318..e6bd8750 100644
--- a/crates/llm-chain-llama/src/options.rs
+++ b/crates/llm-chain-llama/src/options.rs
@@ -9,10 +9,10 @@ use llm_chain::{
 use std::collections::HashMap;
 
 use crate::context::ContextParams;
+use crate::model::ModelParams;
 
 /// Represents a concrete call to the LLM model, with all the parameters specified, and no implicit behavior.
 pub struct LlamaInvocation {
-    pub(crate) n_threads: i32,
     pub(crate) n_tok_predict: usize,
     pub(crate) logit_bias: HashMap<i32, f32>,
     pub(crate) top_k: i32,
@@ -49,7 +49,6 @@ impl LlamaInvocation {
         opt: OptionsCascade,
         prompt: &Prompt,
     ) -> Result<LlamaInvocation, ExecutorCreationError> {
-        let n_threads = opt_extract!(opt, n_threads, NThreads)?;
         let n_tok_predict = opt_extract!(opt, n_tok_predict, MaxTokens)?;
         let top_k = opt_extract!(opt, top_k, TopK)?;
         let top_p = opt_extract!(opt, top_p, TopP)?;
@@ -70,7 +69,6 @@ impl LlamaInvocation {
         let logit_bias = HashMap::<i32, f32>::new(); // token_bias.as_i32_f32_hashmap()?;
 
         Ok(LlamaInvocation {
-            n_threads: *n_threads as i32,
             n_tok_predict: *n_tok_predict,
             logit_bias,
             top_k: *top_k,
@@ -97,6 +95,7 @@ lazy_static! {
         // ModelType: "llama", // not used
         NThreads: 1_usize,
         MaxTokens: 0_usize,
+        MaxBatchSize: 512_usize,
         MaxContextSize: 2048_usize,
         TopK: 40_i32,
         TopP: 0.95,
@@ -111,18 +110,58 @@ lazy_static! {
         MirostatTau: 5.0,
         MirostatEta: 0.1,
         PenalizeNl: true,
-        StopSequence: vec!["\n\n".to_string()]
+        StopSequence: vec!["\n\n".to_string()],
+        NGpuLayers: 0_i32,
+        MainGpu: 0_i32,
+        TensorSplit: Vec::new(),
+        VocabOnly: false,
+        UseMmap: true,
+        UseMlock: false
     );
 }
 
 pub(crate) fn get_executor_initial_opts(
     opt: &OptionsCascade,
-) -> Result<(String, ContextParams), ExecutorCreationError> {
+) -> Result<(String, ModelParams, ContextParams), ExecutorCreationError> {
     let model = opt_extract!(opt, model, Model)?;
-    let max_context_size = opt_extract!(opt, max_context_size, MaxContextSize)?;
+
+    let mut mp = ModelParams::new();
+    if let Some(Opt::NGpuLayers(value)) = opt.get(OptDiscriminants::NGpuLayers) {
+        mp.n_gpu_layers = *value;
+    }
+    if let Some(Opt::MainGpu(value)) = opt.get(OptDiscriminants::MainGpu) {
+        mp.main_gpu = *value;
+    }
+    if let Some(Opt::TensorSplit(values)) = opt.get(OptDiscriminants::TensorSplit) {
+        mp.tensor_split = values.clone();
+    }
+    // Currently, the setting of vocab_only is not allowed as it will cause
+    // a crash when using the llama executor which needs to have weights loaded
+    // in order to work.
+    mp.vocab_only = false;
+
+    if let Some(Opt::UseMmap(value)) = opt.get(OptDiscriminants::UseMmap) {
+        mp.use_mmap = *value;
+    }
+    if let Some(Opt::UseMlock(value)) = opt.get(OptDiscriminants::UseMlock) {
+        mp.use_mlock = *value;
+    }
 
     let mut cp = ContextParams::new();
-    cp.n_ctx = *max_context_size as i32;
+    if let Some(Opt::NThreads(value)) = opt.get(OptDiscriminants::NThreads) {
+        cp.n_threads = *value as u32;
+    }
+
+    let max_context_size = opt_extract!(opt, max_context_size, MaxContextSize)?;
+    cp.n_ctx = *max_context_size as u32;
+
+    let n_batch = opt_extract!(opt, nbatch, MaxBatchSize)?;
+    cp.n_batch = *n_batch as u32;
+    if max_context_size < n_batch {
+        return Err(ExecutorCreationError::InvalidValue(
+            "MaxBatchSize must be less than or equal to MaxContextSize".to_string(),
+        ));
+    }
 
-    Ok((model.to_path(), cp))
+    Ok((model.to_path(), mp, cp))
 }
diff --git a/crates/llm-chain-llama/src/tokenizer.rs b/crates/llm-chain-llama/src/tokenizer.rs
index 6e12ce97..373ade31 100644
--- a/crates/llm-chain-llama/src/tokenizer.rs
+++ b/crates/llm-chain-llama/src/tokenizer.rs
@@ -3,9 +3,7 @@ use llm_chain::prompt::Data;
 use std::ffi::{CStr, CString};
 use std::os::raw::c_char;
 
-use llm_chain_llama_sys::{
-    llama_token, llama_token_eos as inner_eos, llama_token_to_str, llama_tokenize,
-};
+use llm_chain_llama_sys::{llama_token, llama_token_get_text, llama_tokenize};
 
 use crate::context::LLamaContext;
 
@@ -25,17 +23,13 @@ fn to_cstring(s: &str) -> CString {
 ///
 /// A Rust String representation of the given llama_token.
 fn to_output(context: &LLamaContext, token: i32) -> String {
-    let c_ptr = unsafe { llama_token_to_str(**context, token) };
+    let c_ptr = unsafe { llama_token_get_text(context.model, token) };
     let native_string = unsafe { CStr::from_ptr(c_ptr) }
         .to_string_lossy()
         .into_owned();
     native_string
 }
 
-pub fn llama_token_eos() -> i32 {
-    unsafe { inner_eos() }
-}
-
 /// Helper function to tokenize text using the provided LLamaContext and add_bos option.
 ///
 /// # Arguments
@@ -47,21 +41,42 @@ pub fn llama_token_eos() -> i32 {
 /// # Returns
 ///
 /// A Vec of llama_tokens representing the tokenized input.
-pub(crate) fn tokenize(context: &LLamaContext, text: &str, add_bos: bool) -> Vec<llama_token> {
+pub(crate) fn tokenize(
+    context: &LLamaContext,
+    text: &str,
+    add_bos: bool,
+    special: bool,
+) -> Vec<llama_token> {
     let mut res = Vec::with_capacity(text.len() + add_bos as usize);
     let c_text = to_cstring(text);
-
-    let n = unsafe {
+    let n_tokens = unsafe {
         llama_tokenize(
-            **context,
+            context.model,
             c_text.as_ptr() as *const c_char,
+            c_text.to_bytes().len() as i32,
             res.as_mut_ptr(),
             res.capacity() as i32,
             add_bos,
+            special,
         )
     };
-    assert!(n >= 0);
-    unsafe { res.set_len(n as usize) };
+    if n_tokens < 0 {
+        res.resize(-n_tokens as usize, 0);
+        let new_n_tokens = unsafe {
+            llama_tokenize(
+                context.model,
+                c_text.as_ptr() as *const c_char,
+                c_text.to_bytes().len() as i32,
+                res.as_mut_ptr(),
+                res.capacity() as i32,
+                add_bos,
+                special,
+            )
+        };
+        assert!(new_n_tokens == -n_tokens);
+    } else {
+        unsafe { res.set_len(n_tokens as usize) };
+    }
     res
 }
 
diff --git a/crates/llm-chain/src/options.rs b/crates/llm-chain/src/options.rs
index ee180638..bdef2cb4 100644
--- a/crates/llm-chain/src/options.rs
+++ b/crates/llm-chain/src/options.rs
@@ -341,6 +341,9 @@ pub enum Opt {
     MaxTokens(usize),
     /// The maximum context size of the model.
     MaxContextSize(usize),
+    /// The maximum batch size of the model.
+    /// This is used by llama models.
+    MaxBatchSize(usize),
     /// The sequences that, when encountered, will cause the model to stop generating further tokens.
     /// OpenAI models allow up to four stop sequences.
     StopSequence(Vec<String>),
@@ -394,6 +397,19 @@ pub enum Opt {
     User(String),
     /// The type of the model.
     ModelType(String),
+
+    /// The number of layers to be stored in GPU VRAM for llm-chain-llama.
+    NGpuLayers(i32),
+    /// The GPU that should be used for scratch and small tensors for llm-chain-llama.
+    MainGpu(i32),
+    /// How the layers should be split across the available GPUs for llm-chain-llama.
+    TensorSplit(Vec<f32>),
+    /// Only load the vocabulary for llm-chain-llama, no weights will be loaded.
+    VocabOnly(bool),
+    /// Use memory mapped files for llm-chain-llama where possible.
+    UseMmap(bool),
+    /// Force the system to keep the model in memory for llm-chain-llama.
+    UseMlock(bool),
 }
 
 // Helper function to extract environment variables
diff --git a/crates/llm-chain/src/traits.rs b/crates/llm-chain/src/traits.rs
index b98b5c36..65bc4284 100644
--- a/crates/llm-chain/src/traits.rs
+++ b/crates/llm-chain/src/traits.rs
@@ -27,6 +27,8 @@ pub enum ExecutorCreationError {
     InnerError(#[from] Box<dyn Error + Send + Sync>),
     #[error("Field must be set: {0}")]
     FieldRequiredError(String),
+    #[error("Invalid value. {0}")]
+    InvalidValue(String),
 }
 
 #[derive(thiserror::Error, Debug)]