diff --git a/.github/workflows/cicd.yaml b/.github/workflows/cicd.yaml
index 9c8db2b0..a0207b0d 100644
--- a/.github/workflows/cicd.yaml
+++ b/.github/workflows/cicd.yaml
@@ -36,10 +36,12 @@ jobs:
 
   build_and_test:
     strategy:
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-13, windows-latest]
         rust-version: [stable]
     runs-on: ${{ matrix.os }}
+    timeout-minutes: 30
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
diff --git a/Cargo.lock b/Cargo.lock
index a117b184..dfe6f9eb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -25,9 +25,15 @@ checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217"
 
 [[package]]
 name = "ahash"
-version = "0.4.8"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0453232ace82dee0dd0b4c87a59bd90f7b53b314f3e0f61fe2ee7c8a16482289"
+checksum = "91429305e9f0a25f6205c5b8e0d2db09e0708a7a6df0f42212bb56c32c8ac97a"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "version_check",
+ "zerocopy",
+]
 
 [[package]]
 name = "aho-corasick"
@@ -38,6 +44,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "allocator-api2"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
+
 [[package]]
 name = "android-tzdata"
 version = "0.1.1"
@@ -662,6 +674,12 @@ version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6"
 
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
 [[package]]
 name = "bytes"
 version = "1.5.0"
@@ -748,6 +766,16 @@ dependencies = [
  "libloading",
 ]
 
+[[package]]
+name = "combine"
+version = "4.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
+dependencies = [
+ "bytes",
+ "memchr",
+]
+
 [[package]]
 name = "core-foundation"
 version = "0.9.3"
@@ -803,7 +831,7 @@ dependencies = [
  "autocfg",
  "cfg-if",
  "crossbeam-utils",
- "memoffset",
+ "memoffset 0.9.0",
  "scopeguard",
 ]
 
@@ -1008,6 +1036,18 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "enum-as-inner"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ffccbb6966c05b32ef8fbac435df276c4ae4d3dc55a8cd0eb9745e6c12f546a"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.39",
+]
+
 [[package]]
 name = "env_logger"
 version = "0.10.1"
@@ -1336,15 +1376,6 @@ dependencies = [
  "autocfg",
 ]
 
-[[package]]
-name = "hashbrown"
-version = "0.9.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04"
-dependencies = [
- "ahash 0.4.8",
-]
-
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -1356,6 +1387,10 @@ name = "hashbrown"
 version = "0.14.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156"
+dependencies = [
+ "ahash 0.8.6",
+ "allocator-api2",
+]
 
 [[package]]
 name = "heck"
@@ -1386,16 +1421,18 @@ dependencies = [
 
 [[package]]
 name = "hnsw_rs"
-version = "0.1.19"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0308727459701f2fa18286e50662c37044e130e955bfd42b6ff30260116b2a5"
+checksum = "baf40f00346c339c8181f485ef409e49412649cde8e318cc6804849841ad85f1"
 dependencies = [
+ "anyhow",
  "bincode",
  "cpu-time",
  "env_logger",
- "hashbrown 0.9.1",
+ "hashbrown 0.14.2",
  "lazy_static",
  "log",
+ "mmap-rs",
  "num-traits",
  "num_cpus",
  "parking_lot",
@@ -1972,6 +2009,15 @@ version = "0.4.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
 
+[[package]]
+name = "mach2"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d0d1830bcd151a6fc4aea1369af235b36c1528fe976b8ff678683c9995eade8"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "markdown"
 version = "1.0.0-alpha.14"
@@ -2002,6 +2048,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "memoffset"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "memoffset"
 version = "0.9.0"
@@ -2072,6 +2127,23 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "mmap-rs"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e1af4ac2b44e6faa5d82a400349ccf8444d68559eca4c6f976befc4eee963da"
+dependencies = [
+ "bitflags 1.3.2",
+ "combine",
+ "libc",
+ "mach2",
+ "nix",
+ "sysctl",
+ "thiserror",
+ "widestring",
+ "windows",
+]
+
 [[package]]
 name = "mockall"
 version = "0.11.4"
@@ -2123,6 +2195,19 @@ dependencies = [
  "tempfile",
 ]
 
+[[package]]
+name = "nix"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b"
+dependencies = [
+ "bitflags 1.3.2",
+ "cfg-if",
+ "libc",
+ "memoffset 0.7.1",
+ "pin-utils",
+]
+
 [[package]]
 name = "nom"
 version = "7.1.3"
@@ -3270,6 +3355,20 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
 
+[[package]]
+name = "sysctl"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec7dddc5f0fee506baf8b9fdb989e242f17e4b11c61dfbb0635b705217199eea"
+dependencies = [
+ "bitflags 2.4.1",
+ "byteorder",
+ "enum-as-inner",
+ "libc",
+ "thiserror",
+ "walkdir",
+]
+
 [[package]]
 name = "system-configuration"
 version = "0.5.1"
@@ -4003,6 +4102,12 @@ dependencies = [
  "rustix",
 ]
 
+[[package]]
+name = "widestring"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "653f141f39ec16bba3c5abe400a0c60da7468261cc2cbf36805022876bc721a8"
+
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -4034,6 +4139,15 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets",
+]
+
 [[package]]
 name = "windows-core"
 version = "0.51.1"
@@ -4125,6 +4239,26 @@ version = "0.13.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4"
 
+[[package]]
+name = "zerocopy"
+version = "0.7.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e97e415490559a91254a2979b4829267a57d2fcd741a98eee8b722fb57289aa0"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.7.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd7e48ccf166952882ca8bd778a43502c64f33bf94c12ebe2a7f08e5a0f6689f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.39",
+]
+
 [[package]]
 name = "zeroize"
 version = "1.6.0"
diff --git a/crates/llm-chain-hnsw/Cargo.toml b/crates/llm-chain-hnsw/Cargo.toml
index 703160f7..5f5f9874 100644
--- a/crates/llm-chain-hnsw/Cargo.toml
+++ b/crates/llm-chain-hnsw/Cargo.toml
@@ -14,7 +14,7 @@ repository = "https://github.com/sobelio/llm-chain/"
 
 [dependencies]
 async-trait.workspace = true
-hnsw_rs = "0.1.19"
+hnsw_rs = "0.2"
 llm-chain = { path = "../llm-chain", version = "0.13.0", default-features = false }
 serde.workspace = true
 serde_json.workspace = true
diff --git a/crates/llm-chain-hnsw/examples/dump_load.rs b/crates/llm-chain-hnsw/examples/dump_load.rs
index 58204272..4922de1d 100644
--- a/crates/llm-chain-hnsw/examples/dump_load.rs
+++ b/crates/llm-chain-hnsw/examples/dump_load.rs
@@ -1,3 +1,5 @@
+use hnsw_rs::{hnswio::*, prelude::*};
+use std::path::PathBuf;
 use std::sync::Arc;
 
 use llm_chain::{
@@ -16,7 +18,7 @@ async fn main() {
     let hnsw_index_fn = "hnsw_index".to_string();
     let mut embeddings = llm_chain_openai::embeddings::Embeddings::default();
     let document_store = Arc::new(Mutex::new(InMemoryDocumentStore::<EmptyMetadata>::new()));
-    let mut hnsw_vs = HnswVectorStore::new(
+    let hnsw_vs = HnswVectorStore::new(
         HnswArgs::default(),
         Arc::new(embeddings),
         document_store.clone(),
@@ -56,12 +58,13 @@ async fn main() {
     // Load
     println!("Loading hnsw index from file");
     embeddings = llm_chain_openai::embeddings::Embeddings::default();
-    hnsw_vs = HnswVectorStore::load_from_file(
-        hnsw_index_fn,
-        Arc::new(embeddings),
-        document_store.clone(),
-    )
-    .unwrap();
+
+    let mut hnswio = HnswIo::new(PathBuf::from("."), hnsw_index_fn);
+    let hnsw_loaded = hnswio.load_hnsw::<f32, DistCosine>().unwrap();
+    let hnsw_vs =
+        HnswVectorStore::load_from_file(hnsw_loaded, Arc::new(embeddings), document_store.clone())
+            .unwrap();
+
     println!("Loaded!");
 
     let response = hnsw_vs
diff --git a/crates/llm-chain-hnsw/src/lib.rs b/crates/llm-chain-hnsw/src/lib.rs
index 44661011..87f9d54d 100644
--- a/crates/llm-chain-hnsw/src/lib.rs
+++ b/crates/llm-chain-hnsw/src/lib.rs
@@ -1,10 +1,7 @@
-use std::{
-    collections::HashMap, fs::OpenOptions, io::BufReader, marker::PhantomData, path::PathBuf,
-    sync::Arc,
-};
+use std::{collections::HashMap, marker::PhantomData, sync::Arc};
 
 use async_trait::async_trait;
-use hnsw_rs::{hnsw::Hnsw, hnswio::*, prelude::*};
+use hnsw_rs::{hnsw::Hnsw, prelude::*};
 use llm_chain::{
     document_stores::document_store::*,
     schema::Document,
@@ -32,19 +29,19 @@ impl Default for HnswArgs {
     }
 }
 
-pub struct HnswVectorStore<E, D, M>
+pub struct HnswVectorStore<'a, E, D, M>
 where
     E: Embeddings,
     D: DocumentStore<usize, M> + Send + Sync,
     M: Serialize + DeserializeOwned + Send + Sync,
 {
-    hnsw: Arc<Hnsw<f32, DistCosine>>,
+    hnsw: Arc<Hnsw<'a, f32, DistCosine>>,
     document_store: Arc<Mutex<D>>,
     embeddings: Arc<E>,
     _marker: PhantomData<M>,
 }
 
-impl<E, D, M> HnswVectorStore<E, D, M>
+impl<'a, E, D, M> HnswVectorStore<'a, E, D, M>
 where
     E: Embeddings,
     D: DocumentStore<usize, M> + Send + Sync,
@@ -69,47 +66,20 @@ where
     pub fn dump_to_file(
         &self,
         filename: String,
-    ) -> Result<i32, HnswVectorStoreError<E::Error, D::Error>> {
+    ) -> Result<String, HnswVectorStoreError<E::Error, D::Error>> {
         self.hnsw
             .file_dump(&filename)
-            .map_err(HnswVectorStoreError::FileDumpError)
+            .map_err(|e| HnswVectorStoreError::FileDumpError(e.to_string()))
     }
 
     pub fn load_from_file(
-        filename: String,
+        hnsw: Hnsw<'a, f32, DistCosine>,
         embeddings: Arc<E>,
         document_store: Arc<Mutex<D>>,
-    ) -> Result<Self, HnswVectorStoreError<E::Error, D::Error>> {
-        let graph_fn = format!("{}.hnsw.graph", &filename);
-        let graph_path = PathBuf::from(graph_fn);
-        let graph_file_res = OpenOptions::new().read(true).open(&graph_path);
-        if graph_file_res.is_err() {
-            return Err(HnswVectorStoreError::FileLoadError(format!(
-                "could not open file {:?}",
-                graph_path.as_os_str()
-            )));
-        }
-        let graph_file = graph_file_res.unwrap();
-        let data_fn = format!("{}.hnsw.data", &filename);
-        let data_path = PathBuf::from(data_fn);
-        let data_file_res = OpenOptions::new().read(true).open(&data_path);
-        if data_file_res.is_err() {
-            return Err(HnswVectorStoreError::FileLoadError(format!(
-                "could not open file {:?}",
-                data_path.as_os_str()
-            )));
-        }
-        let data_file = data_file_res.unwrap();
-
-        let mut graph_in = BufReader::new(graph_file);
-        let mut data_in = BufReader::new(data_file);
-
-        let hnsw_description = load_description(&mut graph_in).unwrap();
-        let hnsw_loaded: Hnsw<f32, DistCosine> =
-            load_hnsw(&mut graph_in, &hnsw_description, &mut data_in).unwrap();
-
+    ) -> Result<Self, HnswVectorStoreError<E::Error, D::Error>>
+where {
         Ok(HnswVectorStore {
-            hnsw: Arc::new(hnsw_loaded),
+            hnsw: Arc::new(hnsw),
             document_store,
             embeddings,
             _marker: Default::default(),
@@ -143,7 +113,7 @@ where
 }
 
 #[async_trait]
-impl<E, D, M> VectorStore<E, M> for HnswVectorStore<E, D, M>
+impl<'a, E, D, M> VectorStore<E, M> for HnswVectorStore<'a, E, D, M>
 where
     E: Embeddings + Send + Sync,
     D: DocumentStore<usize, M> + Send + Sync,
diff --git a/crates/llm-chain-llama-sys/README.md b/crates/llm-chain-llama-sys/README.md
index 66f56b64..fd5c31d4 100644
--- a/crates/llm-chain-llama-sys/README.md
+++ b/crates/llm-chain-llama-sys/README.md
@@ -12,3 +12,10 @@ use llama_sys::\*;
 ```
 
 Note that llama-sys provides a lower-level interface than llm-chain-llama, and may be more difficult to use. However, if you need fine-grained control over llama.cpp, llama-sys is the way to go.
+
+## Updating llama.cpp submodule
+To update the llama.cpp submodule, run the following command from this crate's directory (`crates/llm-chain-llama-sys`):
+
+```console
+$ git submodule update --remote --merge llama.cpp
+```
diff --git a/crates/llm-chain-llama-sys/build.rs b/crates/llm-chain-llama-sys/build.rs
index e85f682d..270200eb 100644
--- a/crates/llm-chain-llama-sys/build.rs
+++ b/crates/llm-chain-llama-sys/build.rs
@@ -49,6 +49,9 @@ fn main() {
                 let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
                 b.write_to_file(out_path.join("bindings.rs"))
                     .expect("Couldn't write bindings!");
+                if let Err(e) = b.write_to_file(PathBuf::from("src").join("bindings.rs")) {
+                    println!("cargo:warning=Couldn't write bindings to src directory: {}", e);
+                } // Best-effort: refreshing the checked-in copy in src/ must not fail the build.
             }
             Err(e) => {
                 println!("cargo:warning=Unable to generate bindings: {}", e);
@@ -85,7 +88,9 @@ fn main() {
         .arg("-DLLAMA_ALL_WARNINGS=OFF")
         .arg("-DLLAMA_ALL_WARNINGS_3RD_PARTY=OFF")
         .arg("-DLLAMA_BUILD_TESTS=OFF")
-        .arg("-DLLAMA_BUILD_EXAMPLES=OFF");
+        .arg("-DLLAMA_BUILD_EXAMPLES=OFF")
+        .arg("-DLLAMA_NO_METAL=ON")
+        .arg("-DLLAMA_METAL=OFF");
     // .arg("-DLLAMA_STATIC=ON")
     if cuda_enabled {
         // If CUDA feature is enabled, build with cuBlAS to enable GPU acceleration
diff --git a/crates/llm-chain-llama-sys/llama.cpp b/crates/llm-chain-llama-sys/llama.cpp
index 173d0e64..e4b76bbe 160000
--- a/crates/llm-chain-llama-sys/llama.cpp
+++ b/crates/llm-chain-llama-sys/llama.cpp
@@ -1 +1 @@
-Subproject commit 173d0e6419e8f8f3c1f4f13201b777f4c60629f3
+Subproject commit e4b76bbe316ee50fb17d9ac29e654c0edf830eba
diff --git a/crates/llm-chain-llama-sys/src/bindings.rs b/crates/llm-chain-llama-sys/src/bindings.rs
index eae6618f..2ce37af8 100644
--- a/crates/llm-chain-llama-sys/src/bindings.rs
+++ b/crates/llm-chain-llama-sys/src/bindings.rs
@@ -52,7 +52,7 @@ pub const __STDC_IEC_60559_COMPLEX__: u32 = 201404;
 pub const __STDC_ISO_10646__: u32 = 201706;
 pub const __GNU_LIBRARY__: u32 = 6;
 pub const __GLIBC__: u32 = 2;
-pub const __GLIBC_MINOR__: u32 = 37;
+pub const __GLIBC_MINOR__: u32 = 36;
 pub const _SYS_CDEFS_H: u32 = 1;
 pub const __glibc_c99_flexarr_available: u32 = 1;
 pub const __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI: u32 = 0;
@@ -152,30 +152,76 @@ pub const GGML_FILE_VERSION: u32 = 1;
 pub const GGML_QNT_VERSION: u32 = 2;
 pub const GGML_QNT_VERSION_FACTOR: u32 = 1000;
 pub const GGML_MAX_DIMS: u32 = 4;
-pub const GGML_MAX_NODES: u32 = 4096;
-pub const GGML_MAX_PARAMS: u32 = 256;
+pub const GGML_MAX_PARAMS: u32 = 1024;
 pub const GGML_MAX_CONTEXTS: u32 = 64;
 pub const GGML_MAX_SRC: u32 = 6;
-pub const GGML_MAX_NAME: u32 = 48;
-pub const GGML_MAX_OP_PARAMS: u32 = 32;
+pub const GGML_MAX_NAME: u32 = 64;
+pub const GGML_MAX_OP_PARAMS: u32 = 64;
 pub const GGML_DEFAULT_N_THREADS: u32 = 4;
+pub const GGML_DEFAULT_GRAPH_SIZE: u32 = 2048;
+pub const GGML_MEM_ALIGN: u32 = 16;
 pub const GGML_EXIT_SUCCESS: u32 = 0;
 pub const GGML_EXIT_ABORTED: u32 = 1;
-pub const GGML_GRAPH_HASHTABLE_SIZE: u32 = 8273;
-pub const GGML_CUDA_MAX_DEVICES: u32 = 16;
-pub const LLAMA_MAX_DEVICES: u32 = 16;
-pub const LLAMA_FILE_MAGIC_GGJT: u32 = 1734830708;
-pub const LLAMA_FILE_MAGIC_GGLA: u32 = 1734831201;
-pub const LLAMA_FILE_MAGIC_GGMF: u32 = 1734831462;
-pub const LLAMA_FILE_MAGIC_GGML: u32 = 1734831468;
+pub const GGUF_MAGIC: &[u8; 5] = b"GGUF\0";
+pub const GGUF_VERSION: u32 = 3;
+pub const GGUF_DEFAULT_ALIGNMENT: u32 = 32;
+pub const GGML_N_TASKS_MAX: i32 = -1;
+pub const LLAMA_MAX_DEVICES: u32 = 1;
+pub const _STDIO_H: u32 = 1;
+pub const __GNUC_VA_LIST: u32 = 1;
+pub const _____fpos_t_defined: u32 = 1;
+pub const ____mbstate_t_defined: u32 = 1;
+pub const _____fpos64_t_defined: u32 = 1;
+pub const ____FILE_defined: u32 = 1;
+pub const __FILE_defined: u32 = 1;
+pub const __struct_FILE_defined: u32 = 1;
+pub const _IO_EOF_SEEN: u32 = 16;
+pub const _IO_ERR_SEEN: u32 = 32;
+pub const _IO_USER_LOCK: u32 = 32768;
+pub const __cookie_io_functions_t_defined: u32 = 1;
+pub const _IOFBF: u32 = 0;
+pub const _IOLBF: u32 = 1;
+pub const _IONBF: u32 = 2;
+pub const BUFSIZ: u32 = 8192;
+pub const EOF: i32 = -1;
+pub const SEEK_SET: u32 = 0;
+pub const SEEK_CUR: u32 = 1;
+pub const SEEK_END: u32 = 2;
+pub const SEEK_DATA: u32 = 3;
+pub const SEEK_HOLE: u32 = 4;
+pub const P_tmpdir: &[u8; 5] = b"/tmp\0";
+pub const _BITS_STDIO_LIM_H: u32 = 1;
+pub const L_tmpnam: u32 = 20;
+pub const TMP_MAX: u32 = 238328;
+pub const FILENAME_MAX: u32 = 4096;
+pub const L_ctermid: u32 = 9;
+pub const L_cuserid: u32 = 9;
+pub const FOPEN_MAX: u32 = 16;
+pub const _PRINTF_NAN_LEN_MAX: u32 = 4;
+pub const RENAME_NOREPLACE: u32 = 1;
+pub const RENAME_EXCHANGE: u32 = 2;
+pub const RENAME_WHITEOUT: u32 = 4;
+pub const __HAVE_FLOAT128: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT128: u32 = 0;
+pub const __HAVE_FLOAT64X: u32 = 1;
+pub const __HAVE_FLOAT64X_LONG_DOUBLE: u32 = 1;
+pub const __HAVE_FLOAT16: u32 = 0;
+pub const __HAVE_FLOAT32: u32 = 1;
+pub const __HAVE_FLOAT64: u32 = 1;
+pub const __HAVE_FLOAT32X: u32 = 1;
+pub const __HAVE_FLOAT128X: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT16: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT32: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT64: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT32X: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT64X: u32 = 0;
+pub const __HAVE_DISTINCT_FLOAT128X: u32 = 0;
+pub const __HAVE_FLOATN_NOT_TYPEDEF: u32 = 0;
+pub const LLAMA_DEFAULT_SEED: u32 = 4294967295;
+pub const LLAMA_MAX_RNG_STATE: u32 = 65536;
 pub const LLAMA_FILE_MAGIC_GGSN: u32 = 1734833006;
-pub const LLAMA_FILE_VERSION: u32 = 3;
-pub const LLAMA_FILE_MAGIC: u32 = 1734830708;
-pub const LLAMA_FILE_MAGIC_UNVERSIONED: u32 = 1734831468;
 pub const LLAMA_SESSION_MAGIC: u32 = 1734833006;
-pub const LLAMA_SESSION_VERSION: u32 = 1;
-pub const LLAMA_DEFAULT_SEED: u32 = 4294967295;
-pub const LLAMA_DEFAULT_RMS_EPS: f64 = 0.000005;
+pub const LLAMA_SESSION_VERSION: u32 = 2;
 pub type __u_char = ::std::os::raw::c_uchar;
 pub type __u_short = ::std::os::raw::c_ushort;
 pub type __u_int = ::std::os::raw::c_uint;
@@ -366,10 +412,10 @@ pub const ggml_type_GGML_TYPE_I16: ggml_type = 17;
 pub const ggml_type_GGML_TYPE_I32: ggml_type = 18;
 pub const ggml_type_GGML_TYPE_COUNT: ggml_type = 19;
 pub type ggml_type = ::std::os::raw::c_uint;
-pub const ggml_backend_GGML_BACKEND_CPU: ggml_backend = 0;
-pub const ggml_backend_GGML_BACKEND_GPU: ggml_backend = 10;
-pub const ggml_backend_GGML_BACKEND_GPU_SPLIT: ggml_backend = 20;
-pub type ggml_backend = ::std::os::raw::c_uint;
+pub const ggml_backend_type_GGML_BACKEND_CPU: ggml_backend_type = 0;
+pub const ggml_backend_type_GGML_BACKEND_GPU: ggml_backend_type = 10;
+pub const ggml_backend_type_GGML_BACKEND_GPU_SPLIT: ggml_backend_type = 20;
+pub type ggml_backend_type = ::std::os::raw::c_uint;
 pub const ggml_ftype_GGML_FTYPE_UNKNOWN: ggml_ftype = -1;
 pub const ggml_ftype_GGML_FTYPE_ALL_F32: ggml_ftype = 0;
 pub const ggml_ftype_GGML_FTYPE_MOSTLY_F16: ggml_ftype = 1;
@@ -402,49 +448,58 @@ pub const ggml_op_GGML_OP_MEAN: ggml_op = 13;
 pub const ggml_op_GGML_OP_ARGMAX: ggml_op = 14;
 pub const ggml_op_GGML_OP_REPEAT: ggml_op = 15;
 pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 16;
-pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 17;
-pub const ggml_op_GGML_OP_NORM: ggml_op = 18;
-pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 19;
-pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 20;
-pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 21;
-pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 22;
-pub const ggml_op_GGML_OP_SCALE: ggml_op = 23;
-pub const ggml_op_GGML_OP_SET: ggml_op = 24;
-pub const ggml_op_GGML_OP_CPY: ggml_op = 25;
-pub const ggml_op_GGML_OP_CONT: ggml_op = 26;
-pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 27;
-pub const ggml_op_GGML_OP_VIEW: ggml_op = 28;
-pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 29;
-pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 30;
-pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 31;
-pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 32;
-pub const ggml_op_GGML_OP_DIAG: ggml_op = 33;
-pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 34;
-pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 35;
-pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 36;
-pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 37;
-pub const ggml_op_GGML_OP_ROPE: ggml_op = 38;
-pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 39;
-pub const ggml_op_GGML_OP_ALIBI: ggml_op = 40;
-pub const ggml_op_GGML_OP_CLAMP: ggml_op = 41;
-pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 42;
-pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 43;
-pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 44;
-pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 45;
-pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 46;
-pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 47;
-pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 48;
-pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 49;
-pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 50;
-pub const ggml_op_GGML_OP_UNARY: ggml_op = 51;
-pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 52;
-pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 53;
-pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 54;
-pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 55;
-pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 56;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 57;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 58;
-pub const ggml_op_GGML_OP_COUNT: ggml_op = 59;
+pub const ggml_op_GGML_OP_CONCAT: ggml_op = 17;
+pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 18;
+pub const ggml_op_GGML_OP_NORM: ggml_op = 19;
+pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 20;
+pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 21;
+pub const ggml_op_GGML_OP_GROUP_NORM: ggml_op = 22;
+pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 23;
+pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 24;
+pub const ggml_op_GGML_OP_SCALE: ggml_op = 25;
+pub const ggml_op_GGML_OP_SET: ggml_op = 26;
+pub const ggml_op_GGML_OP_CPY: ggml_op = 27;
+pub const ggml_op_GGML_OP_CONT: ggml_op = 28;
+pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 29;
+pub const ggml_op_GGML_OP_VIEW: ggml_op = 30;
+pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 31;
+pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 32;
+pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 33;
+pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 34;
+pub const ggml_op_GGML_OP_DIAG: ggml_op = 35;
+pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 36;
+pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 37;
+pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 38;
+pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 39;
+pub const ggml_op_GGML_OP_ROPE: ggml_op = 40;
+pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 41;
+pub const ggml_op_GGML_OP_ALIBI: ggml_op = 42;
+pub const ggml_op_GGML_OP_CLAMP: ggml_op = 43;
+pub const ggml_op_GGML_OP_CONV_TRANSPOSE_1D: ggml_op = 44;
+pub const ggml_op_GGML_OP_IM2COL: ggml_op = 45;
+pub const ggml_op_GGML_OP_CONV_TRANSPOSE_2D: ggml_op = 46;
+pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 47;
+pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 48;
+pub const ggml_op_GGML_OP_UPSCALE: ggml_op = 49;
+pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 50;
+pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 51;
+pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 52;
+pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 53;
+pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 54;
+pub const ggml_op_GGML_OP_GET_REL_POS: ggml_op = 55;
+pub const ggml_op_GGML_OP_ADD_REL_POS: ggml_op = 56;
+pub const ggml_op_GGML_OP_UNARY: ggml_op = 57;
+pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 58;
+pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 59;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1_F32: ggml_op = 60;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2_F32: ggml_op = 61;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3_F32: ggml_op = 62;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 63;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 64;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 65;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 66;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 67;
+pub const ggml_op_GGML_OP_COUNT: ggml_op = 68;
 pub type ggml_op = ::std::os::raw::c_uint;
 pub const ggml_unary_op_GGML_UNARY_OP_ABS: ggml_unary_op = 0;
 pub const ggml_unary_op_GGML_UNARY_OP_SGN: ggml_unary_op = 1;
@@ -456,11 +511,16 @@ pub const ggml_unary_op_GGML_UNARY_OP_RELU: ggml_unary_op = 6;
 pub const ggml_unary_op_GGML_UNARY_OP_GELU: ggml_unary_op = 7;
 pub const ggml_unary_op_GGML_UNARY_OP_GELU_QUICK: ggml_unary_op = 8;
 pub const ggml_unary_op_GGML_UNARY_OP_SILU: ggml_unary_op = 9;
+pub const ggml_unary_op_GGML_UNARY_OP_LEAKY: ggml_unary_op = 10;
 pub type ggml_unary_op = ::std::os::raw::c_uint;
 pub const ggml_object_type_GGML_OBJECT_TENSOR: ggml_object_type = 0;
 pub const ggml_object_type_GGML_OBJECT_GRAPH: ggml_object_type = 1;
 pub const ggml_object_type_GGML_OBJECT_WORK_BUFFER: ggml_object_type = 2;
 pub type ggml_object_type = ::std::os::raw::c_uint;
+pub const ggml_log_level_GGML_LOG_LEVEL_ERROR: ggml_log_level = 2;
+pub const ggml_log_level_GGML_LOG_LEVEL_WARN: ggml_log_level = 3;
+pub const ggml_log_level_GGML_LOG_LEVEL_INFO: ggml_log_level = 4;
+pub type ggml_log_level = ::std::os::raw::c_uint;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_object {
@@ -540,22 +600,25 @@ pub const GGML_OBJECT_SIZE: usize = 32;
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_tensor {
     pub type_: ggml_type,
-    pub backend: ggml_backend,
+    pub backend: ggml_backend_type,
+    pub buffer: *mut ggml_backend_buffer,
     pub n_dims: ::std::os::raw::c_int,
     pub ne: [i64; 4usize],
     pub nb: [usize; 4usize],
     pub op: ggml_op,
-    pub op_params: [i32; 8usize],
+    pub op_params: [i32; 16usize],
     pub is_param: bool,
     pub grad: *mut ggml_tensor,
     pub src: [*mut ggml_tensor; 6usize],
     pub perf_runs: ::std::os::raw::c_int,
     pub perf_cycles: i64,
     pub perf_time_us: i64,
+    pub view_src: *mut ggml_tensor,
+    pub view_offs: usize,
     pub data: *mut ::std::os::raw::c_void,
-    pub name: [::std::os::raw::c_char; 48usize],
+    pub name: [::std::os::raw::c_char; 64usize],
     pub extra: *mut ::std::os::raw::c_void,
-    pub padding: [::std::os::raw::c_char; 4usize],
+    pub padding: [::std::os::raw::c_char; 12usize],
 }
 #[test]
 fn bindgen_test_layout_ggml_tensor() {
@@ -563,7 +626,7 @@ fn bindgen_test_layout_ggml_tensor() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_tensor>(),
-        272usize,
+        352usize,
         concat!("Size of: ", stringify!(ggml_tensor))
     );
     assert_eq!(
@@ -592,8 +655,18 @@ fn bindgen_test_layout_ggml_tensor() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_dims) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).buffer) as usize - ptr as usize },
         8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(buffer)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_dims) as usize - ptr as usize },
+        16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -603,7 +676,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).ne) as usize - ptr as usize },
-        16usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -613,7 +686,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).nb) as usize - ptr as usize },
-        48usize,
+        56usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -623,7 +696,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).op) as usize - ptr as usize },
-        80usize,
+        88usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -633,7 +706,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).op_params) as usize - ptr as usize },
-        84usize,
+        92usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -643,7 +716,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).is_param) as usize - ptr as usize },
-        116usize,
+        156usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -653,7 +726,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).grad) as usize - ptr as usize },
-        120usize,
+        160usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -663,7 +736,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).src) as usize - ptr as usize },
-        128usize,
+        168usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -673,7 +746,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
-        176usize,
+        216usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -683,7 +756,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize },
-        184usize,
+        224usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -693,7 +766,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize },
-        192usize,
+        232usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -701,9 +774,29 @@ fn bindgen_test_layout_ggml_tensor() {
             stringify!(perf_time_us)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).view_src) as usize - ptr as usize },
+        240usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(view_src)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).view_offs) as usize - ptr as usize },
+        248usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(view_offs)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
-        200usize,
+        256usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -713,7 +806,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).name) as usize - ptr as usize },
-        208usize,
+        264usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -723,7 +816,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).extra) as usize - ptr as usize },
-        256usize,
+        328usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -733,7 +826,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize },
-        264usize,
+        336usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -742,14 +835,13 @@ fn bindgen_test_layout_ggml_tensor() {
         )
     );
 }
-pub const GGML_TENSOR_SIZE: usize = 272;
+pub const GGML_TENSOR_SIZE: usize = 352;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_cplan {
     pub work_size: usize,
     pub work_data: *mut u8,
     pub n_threads: ::std::os::raw::c_int,
-    pub n_tasks: [::std::os::raw::c_int; 4096usize],
     pub abort_callback:
         ::std::option::Option<unsafe extern "C" fn(data: *mut ::std::os::raw::c_void) -> bool>,
     pub abort_callback_data: *mut ::std::os::raw::c_void,
@@ -760,7 +852,7 @@ fn bindgen_test_layout_ggml_cplan() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_cplan>(),
-        16424usize,
+        40usize,
         concat!("Size of: ", stringify!(ggml_cplan))
     );
     assert_eq!(
@@ -799,45 +891,82 @@ fn bindgen_test_layout_ggml_cplan() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_tasks) as usize - ptr as usize },
-        20usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).abort_callback) as usize - ptr as usize },
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cplan),
             "::",
-            stringify!(n_tasks)
+            stringify!(abort_callback)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).abort_callback) as usize - ptr as usize },
-        16408usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).abort_callback_data) as usize - ptr as usize },
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cplan),
             "::",
-            stringify!(abort_callback)
+            stringify!(abort_callback_data)
         )
     );
+}
+pub const ggml_cgraph_eval_order_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT: ggml_cgraph_eval_order = 0;
+pub const ggml_cgraph_eval_order_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT: ggml_cgraph_eval_order = 1;
+pub const ggml_cgraph_eval_order_GGML_CGRAPH_EVAL_ORDER_COUNT: ggml_cgraph_eval_order = 2;
+pub type ggml_cgraph_eval_order = ::std::os::raw::c_uint;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct ggml_hash_set {
+    pub size: usize,
+    pub keys: *mut *mut ggml_tensor,
+}
+#[test]
+fn bindgen_test_layout_ggml_hash_set() {
+    const UNINIT: ::std::mem::MaybeUninit<ggml_hash_set> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).abort_callback_data) as usize - ptr as usize },
-        16416usize,
+        ::std::mem::size_of::<ggml_hash_set>(),
+        16usize,
+        concat!("Size of: ", stringify!(ggml_hash_set))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<ggml_hash_set>(),
+        8usize,
+        concat!("Alignment of ", stringify!(ggml_hash_set))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
+        0usize,
         concat!(
             "Offset of field: ",
-            stringify!(ggml_cplan),
+            stringify!(ggml_hash_set),
             "::",
-            stringify!(abort_callback_data)
+            stringify!(size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).keys) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_hash_set),
+            "::",
+            stringify!(keys)
         )
     );
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_cgraph {
+    pub size: ::std::os::raw::c_int,
     pub n_nodes: ::std::os::raw::c_int,
     pub n_leafs: ::std::os::raw::c_int,
-    pub nodes: [*mut ggml_tensor; 4096usize],
-    pub grads: [*mut ggml_tensor; 4096usize],
-    pub leafs: [*mut ggml_tensor; 4096usize],
-    pub visited_hash_table: [*mut ::std::os::raw::c_void; 8273usize],
+    pub nodes: *mut *mut ggml_tensor,
+    pub grads: *mut *mut ggml_tensor,
+    pub leafs: *mut *mut ggml_tensor,
+    pub visited_hash_table: ggml_hash_set,
+    pub order: ggml_cgraph_eval_order,
     pub perf_runs: ::std::os::raw::c_int,
     pub perf_cycles: i64,
     pub perf_time_us: i64,
@@ -848,7 +977,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_cgraph>(),
-        164520usize,
+        80usize,
         concat!("Size of: ", stringify!(ggml_cgraph))
     );
     assert_eq!(
@@ -857,8 +986,18 @@ fn bindgen_test_layout_ggml_cgraph() {
         concat!("Alignment of ", stringify!(ggml_cgraph))
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
         0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_cgraph),
+            "::",
+            stringify!(size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize },
+        4usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -868,7 +1007,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).n_leafs) as usize - ptr as usize },
-        4usize,
+        8usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -878,7 +1017,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).nodes) as usize - ptr as usize },
-        8usize,
+        16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -888,7 +1027,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).grads) as usize - ptr as usize },
-        32776usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -898,7 +1037,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).leafs) as usize - ptr as usize },
-        65544usize,
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -908,7 +1047,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).visited_hash_table) as usize - ptr as usize },
-        98312usize,
+        40usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -916,9 +1055,19 @@ fn bindgen_test_layout_ggml_cgraph() {
             stringify!(visited_hash_table)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).order) as usize - ptr as usize },
+        56usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_cgraph),
+            "::",
+            stringify!(order)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
-        164496usize,
+        60usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -928,7 +1077,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize },
-        164504usize,
+        64usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -938,7 +1087,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize },
-        164512usize,
+        72usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -947,7 +1096,6 @@ fn bindgen_test_layout_ggml_cgraph() {
         )
     );
 }
-pub const GGML_GRAPH_SIZE: usize = 164520;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_scratch {
@@ -1145,6 +1293,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_cycles_per_ms() -> i64;
 }
+extern "C" {
+    pub fn ggml_print_backtrace();
+}
 extern "C" {
     pub fn ggml_numa_init();
 }
@@ -1166,6 +1317,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_nbytes(tensor: *const ggml_tensor) -> usize;
 }
+extern "C" {
+    pub fn ggml_nbytes_pad(tensor: *const ggml_tensor) -> usize;
+}
 extern "C" {
     pub fn ggml_nbytes_split(
         tensor: *const ggml_tensor,
@@ -1208,6 +1362,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_is_permuted(tensor: *const ggml_tensor) -> bool;
 }
+extern "C" {
+    pub fn ggml_are_same_shape(t0: *const ggml_tensor, t1: *const ggml_tensor) -> bool;
+}
 extern "C" {
     pub fn ggml_tensor_overhead() -> usize;
 }
@@ -1290,7 +1447,16 @@ extern "C" {
     pub fn ggml_dup_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *mut ggml_tensor) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_get_first_tensor(ctx: *mut ggml_context) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_get_next_tensor(
+        ctx: *mut ggml_context,
+        tensor: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_get_tensor(
@@ -1307,18 +1473,66 @@ extern "C" {
 extern "C" {
     pub fn ggml_set_f32(tensor: *mut ggml_tensor, value: f32) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_unravel_index(
+        tensor: *const ggml_tensor,
+        i: i64,
+        i0: *mut i64,
+        i1: *mut i64,
+        i2: *mut i64,
+        i3: *mut i64,
+    );
+}
 extern "C" {
     pub fn ggml_get_i32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int) -> i32;
 }
 extern "C" {
     pub fn ggml_set_i32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int, value: i32);
 }
+extern "C" {
+    pub fn ggml_get_i32_nd(
+        tensor: *const ggml_tensor,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+        i2: ::std::os::raw::c_int,
+        i3: ::std::os::raw::c_int,
+    ) -> i32;
+}
+extern "C" {
+    pub fn ggml_set_i32_nd(
+        tensor: *const ggml_tensor,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+        i2: ::std::os::raw::c_int,
+        i3: ::std::os::raw::c_int,
+        value: i32,
+    );
+}
 extern "C" {
     pub fn ggml_get_f32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int) -> f32;
 }
 extern "C" {
     pub fn ggml_set_f32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int, value: f32);
 }
+extern "C" {
+    pub fn ggml_get_f32_nd(
+        tensor: *const ggml_tensor,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+        i2: ::std::os::raw::c_int,
+        i3: ::std::os::raw::c_int,
+    ) -> f32;
+}
+extern "C" {
+    pub fn ggml_set_f32_nd(
+        tensor: *const ggml_tensor,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+        i2: ::std::os::raw::c_int,
+        i3: ::std::os::raw::c_int,
+        value: f32,
+    );
+}
 extern "C" {
     pub fn ggml_get_data(tensor: *const ggml_tensor) -> *mut ::std::os::raw::c_void;
 }
@@ -1364,6 +1578,14 @@ extern "C" {
         b: *mut ggml_tensor,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_add_cast(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        type_: ggml_type,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_add1(
         ctx: *mut ggml_context,
@@ -1486,6 +1708,13 @@ extern "C" {
         b: *mut ggml_tensor,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_concat(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_abs(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
@@ -1525,6 +1754,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_relu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_leaky(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_relu_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
@@ -1555,10 +1787,14 @@ extern "C" {
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_norm_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        eps: f32,
+    ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32)
@@ -1571,11 +1807,26 @@ extern "C" {
         eps: f32,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_group_norm(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        n_groups: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_group_norm_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        n_groups: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_rms_norm_back(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
         b: *mut ggml_tensor,
+        eps: f32,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1682,6 +1933,36 @@ extern "C" {
 extern "C" {
     pub fn ggml_cont_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_cont_1d(ctx: *mut ggml_context, a: *mut ggml_tensor, ne0: i64) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_cont_2d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        ne0: i64,
+        ne1: i64,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_cont_3d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        ne0: i64,
+        ne1: i64,
+        ne2: i64,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_cont_4d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        ne0: i64,
+        ne1: i64,
+        ne2: i64,
+        ne3: i64,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_reshape(
         ctx: *mut ggml_context,
@@ -1832,6 +2113,14 @@ extern "C" {
 extern "C" {
     pub fn ggml_soft_max_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_soft_max_ext(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        mask: *mut ggml_tensor,
+        scale: f32,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_soft_max_back(
         ctx: *mut ggml_context,
@@ -1850,7 +2139,7 @@ extern "C" {
     pub fn ggml_rope(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
@@ -1860,7 +2149,7 @@ extern "C" {
     pub fn ggml_rope_inplace(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
@@ -1870,34 +2159,73 @@ extern "C" {
     pub fn ggml_rope_custom(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
+        n_orig_ctx: ::std::os::raw::c_int,
         freq_base: f32,
         freq_scale: f32,
+        ext_factor: f32,
+        attn_factor: f32,
+        beta_fast: f32,
+        beta_slow: f32,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_rope_custom_inplace(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
+        n_orig_ctx: ::std::os::raw::c_int,
         freq_base: f32,
         freq_scale: f32,
+        ext_factor: f32,
+        attn_factor: f32,
+        beta_fast: f32,
+        beta_slow: f32,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_rope_yarn_corr_dims(
+        n_dims: ::std::os::raw::c_int,
+        n_orig_ctx: ::std::os::raw::c_int,
+        freq_base: f32,
+        beta_fast: f32,
+        beta_slow: f32,
+        dims: *mut f32,
+    );
+}
+extern "C" {
+    pub fn ggml_rope_xpos_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        n_dims: ::std::os::raw::c_int,
+        base: f32,
+        down: bool,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_rope_back(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        n_past: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
+        n_orig_ctx: ::std::os::raw::c_int,
+        freq_base: f32,
+        freq_scale: f32,
+        ext_factor: f32,
+        attn_factor: f32,
+        beta_fast: f32,
+        beta_slow: f32,
+        xpos_base: f32,
+        xpos_down: bool,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1918,26 +2246,27 @@ extern "C" {
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_conv_1d(
+    pub fn ggml_im2col(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
         b: *mut ggml_tensor,
         s0: ::std::os::raw::c_int,
+        s1: ::std::os::raw::c_int,
         p0: ::std::os::raw::c_int,
+        p1: ::std::os::raw::c_int,
         d0: ::std::os::raw::c_int,
+        d1: ::std::os::raw::c_int,
+        is_2D: bool,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_conv_2d(
+    pub fn ggml_conv_1d(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
         b: *mut ggml_tensor,
         s0: ::std::os::raw::c_int,
-        s1: ::std::os::raw::c_int,
         p0: ::std::os::raw::c_int,
-        p1: ::std::os::raw::c_int,
         d0: ::std::os::raw::c_int,
-        d1: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1949,31 +2278,83 @@ extern "C" {
         d: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
-pub const ggml_op_pool_GGML_OP_POOL_MAX: ggml_op_pool = 0;
-pub const ggml_op_pool_GGML_OP_POOL_AVG: ggml_op_pool = 1;
-pub const ggml_op_pool_GGML_OP_POOL_COUNT: ggml_op_pool = 2;
-pub type ggml_op_pool = ::std::os::raw::c_uint;
 extern "C" {
-    pub fn ggml_pool_1d(
+    pub fn ggml_conv_transpose_1d(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        op: ggml_op_pool,
-        k0: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         s0: ::std::os::raw::c_int,
         p0: ::std::os::raw::c_int,
+        d0: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_pool_2d(
+    pub fn ggml_conv_2d(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
-        op: ggml_op_pool,
-        k0: ::std::os::raw::c_int,
-        k1: ::std::os::raw::c_int,
+        b: *mut ggml_tensor,
         s0: ::std::os::raw::c_int,
         s1: ::std::os::raw::c_int,
         p0: ::std::os::raw::c_int,
         p1: ::std::os::raw::c_int,
+        d0: ::std::os::raw::c_int,
+        d1: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_conv_2d_sk_p0(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_conv_2d_s1_ph(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_conv_transpose_2d_p0(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        stride: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+pub const ggml_op_pool_GGML_OP_POOL_MAX: ggml_op_pool = 0;
+pub const ggml_op_pool_GGML_OP_POOL_AVG: ggml_op_pool = 1;
+pub const ggml_op_pool_GGML_OP_POOL_COUNT: ggml_op_pool = 2;
+pub type ggml_op_pool = ::std::os::raw::c_uint;
+extern "C" {
+    pub fn ggml_pool_1d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_op_pool,
+        k0: ::std::os::raw::c_int,
+        s0: ::std::os::raw::c_int,
+        p0: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_pool_2d(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_op_pool,
+        k0: ::std::os::raw::c_int,
+        k1: ::std::os::raw::c_int,
+        s0: ::std::os::raw::c_int,
+        s1: ::std::os::raw::c_int,
+        p0: f32,
+        p1: f32,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_upscale(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        scale_factor: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -2021,6 +2402,44 @@ extern "C" {
         w: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_unary(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_unary_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_get_rel_pos(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        qh: ::std::os::raw::c_int,
+        kh: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_add_rel_pos(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        pw: *mut ggml_tensor,
+        ph: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_add_rel_pos_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        pw: *mut ggml_tensor,
+        ph: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
 pub type ggml_unary_op_f32_t = ::std::option::Option<
     unsafe extern "C" fn(arg1: ::std::os::raw::c_int, arg2: *mut f32, arg3: *const f32),
 >;
@@ -2049,20 +2468,6 @@ pub type ggml_custom3_op_f32_t = ::std::option::Option<
         arg4: *const ggml_tensor,
     ),
 >;
-extern "C" {
-    pub fn ggml_unary(
-        ctx: *mut ggml_context,
-        a: *mut ggml_tensor,
-        op: ggml_unary_op,
-    ) -> *mut ggml_tensor;
-}
-extern "C" {
-    pub fn ggml_unary_inplace(
-        ctx: *mut ggml_context,
-        a: *mut ggml_tensor,
-        op: ggml_unary_op,
-    ) -> *mut ggml_tensor;
-}
 extern "C" {
     pub fn ggml_map_unary_f32(
         ctx: *mut ggml_context,
@@ -2141,6 +2546,96 @@ extern "C" {
         fun: ggml_custom3_op_f32_t,
     ) -> *mut ggml_tensor;
 }
+pub type ggml_custom1_op_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        dst: *mut ggml_tensor,
+        a: *const ggml_tensor,
+        ith: ::std::os::raw::c_int,
+        nth: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ),
+>;
+pub type ggml_custom2_op_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        dst: *mut ggml_tensor,
+        a: *const ggml_tensor,
+        b: *const ggml_tensor,
+        ith: ::std::os::raw::c_int,
+        nth: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ),
+>;
+pub type ggml_custom3_op_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        dst: *mut ggml_tensor,
+        a: *const ggml_tensor,
+        b: *const ggml_tensor,
+        c: *const ggml_tensor,
+        ith: ::std::os::raw::c_int,
+        nth: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ),
+>;
+extern "C" {
+    pub fn ggml_map_custom1(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        fun: ggml_custom1_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom1_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        fun: ggml_custom1_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom2(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        fun: ggml_custom2_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom2_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        fun: ggml_custom2_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom3(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        c: *mut ggml_tensor,
+        fun: ggml_custom3_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom3_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        c: *mut ggml_tensor,
+        fun: ggml_custom3_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_cross_entropy_loss(
         ctx: *mut ggml_context,
@@ -2163,27 +2658,49 @@ extern "C" {
     pub fn ggml_build_forward_expand(cgraph: *mut ggml_cgraph, tensor: *mut ggml_tensor);
 }
 extern "C" {
-    pub fn ggml_build_forward(tensor: *mut ggml_tensor) -> ggml_cgraph;
-}
-extern "C" {
-    pub fn ggml_build_backward(
+    pub fn ggml_build_backward_expand(
         ctx: *mut ggml_context,
         gf: *mut ggml_cgraph,
+        gb: *mut ggml_cgraph,
         keep: bool,
-    ) -> ggml_cgraph;
+    );
 }
 extern "C" {
     pub fn ggml_new_graph(ctx: *mut ggml_context) -> *mut ggml_cgraph;
 }
 extern "C" {
-    pub fn ggml_build_forward_ctx(
+    pub fn ggml_new_graph_custom(
         ctx: *mut ggml_context,
-        tensor: *mut ggml_tensor,
+        size: usize,
+        grads: bool,
     ) -> *mut ggml_cgraph;
 }
+extern "C" {
+    pub fn ggml_graph_dup(ctx: *mut ggml_context, cgraph: *mut ggml_cgraph) -> *mut ggml_cgraph;
+}
+extern "C" {
+    pub fn ggml_graph_view(
+        ctx: *mut ggml_context,
+        cgraph: *mut ggml_cgraph,
+        i0: ::std::os::raw::c_int,
+        i1: ::std::os::raw::c_int,
+    ) -> *mut ggml_cgraph;
+}
+extern "C" {
+    pub fn ggml_graph_cpy(src: *mut ggml_cgraph, dst: *mut ggml_cgraph);
+}
+extern "C" {
+    pub fn ggml_graph_reset(cgraph: *mut ggml_cgraph);
+}
+extern "C" {
+    pub fn ggml_graph_clear(cgraph: *mut ggml_cgraph);
+}
 extern "C" {
     pub fn ggml_graph_overhead() -> usize;
 }
+extern "C" {
+    pub fn ggml_graph_overhead_custom(size: usize, grads: bool) -> usize;
+}
 extern "C" {
     pub fn ggml_graph_plan(
         cgraph: *mut ggml_cgraph,
@@ -2196,9 +2713,6 @@ extern "C" {
         cplan: *mut ggml_cplan,
     ) -> ::std::os::raw::c_int;
 }
-extern "C" {
-    pub fn ggml_graph_reset(cgraph: *mut ggml_cgraph);
-}
 extern "C" {
     pub fn ggml_graph_compute_with_ctx(
         ctx: *mut ggml_context,
@@ -2220,7 +2734,7 @@ extern "C" {
         fname: *const ::std::os::raw::c_char,
         ctx_data: *mut *mut ggml_context,
         ctx_eval: *mut *mut ggml_context,
-    ) -> ggml_cgraph;
+    ) -> *mut ggml_cgraph;
 }
 extern "C" {
     pub fn ggml_graph_print(cgraph: *const ggml_cgraph);
@@ -2232,6 +2746,16 @@ extern "C" {
         filename: *const ::std::os::raw::c_char,
     );
 }
+extern "C" {
+    pub fn ggml_build_backward_gradient_checkpointing(
+        ctx: *mut ggml_context,
+        gf: *mut ggml_cgraph,
+        gb: *mut ggml_cgraph,
+        gb_tmp: *mut ggml_cgraph,
+        checkpoints: *mut *mut ggml_tensor,
+        n_checkpoints: ::std::os::raw::c_int,
+    );
+}
 pub const ggml_opt_type_GGML_OPT_ADAM: ggml_opt_type = 0;
 pub const ggml_opt_type_GGML_OPT_LBFGS: ggml_opt_type = 1;
 pub type ggml_opt_type = ::std::os::raw::c_uint;
@@ -2245,22 +2769,40 @@ pub const ggml_opt_result_GGML_OPT_DID_NOT_CONVERGE: ggml_opt_result = 1;
 pub const ggml_opt_result_GGML_OPT_NO_CONTEXT: ggml_opt_result = 2;
 pub const ggml_opt_result_GGML_OPT_INVALID_WOLFE: ggml_opt_result = 3;
 pub const ggml_opt_result_GGML_OPT_FAIL: ggml_opt_result = 4;
+pub const ggml_opt_result_GGML_OPT_CANCEL: ggml_opt_result = 5;
 pub const ggml_opt_result_GGML_LINESEARCH_FAIL: ggml_opt_result = -128;
 pub const ggml_opt_result_GGML_LINESEARCH_MINIMUM_STEP: ggml_opt_result = -127;
 pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_STEP: ggml_opt_result = -126;
 pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_ITERATIONS: ggml_opt_result = -125;
 pub const ggml_opt_result_GGML_LINESEARCH_INVALID_PARAMETERS: ggml_opt_result = -124;
 pub type ggml_opt_result = ::std::os::raw::c_int;
+pub type ggml_opt_callback = ::std::option::Option<
+    unsafe extern "C" fn(
+        data: *mut ::std::os::raw::c_void,
+        accum_step: ::std::os::raw::c_int,
+        sched: *mut f32,
+        cancel: *mut bool,
+    ),
+>;
+pub type ggml_log_callback = ::std::option::Option<
+    unsafe extern "C" fn(
+        level: ggml_log_level,
+        text: *const ::std::os::raw::c_char,
+        user_data: *mut ::std::os::raw::c_void,
+    ),
+>;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_opt_params {
     pub type_: ggml_opt_type,
+    pub graph_size: usize,
     pub n_threads: ::std::os::raw::c_int,
     pub past: ::std::os::raw::c_int,
     pub delta: f32,
     pub max_no_improvement: ::std::os::raw::c_int,
     pub print_forward_graph: bool,
     pub print_backward_graph: bool,
+    pub n_gradient_accumulation: ::std::os::raw::c_int,
     pub adam: ggml_opt_params__bindgen_ty_1,
     pub lbfgs: ggml_opt_params__bindgen_ty_2,
 }
@@ -2270,12 +2812,14 @@ pub struct ggml_opt_params__bindgen_ty_1 {
     pub n_iter: ::std::os::raw::c_int,
     pub sched: f32,
     pub decay: f32,
+    pub decay_min_ndim: ::std::os::raw::c_int,
     pub alpha: f32,
     pub beta1: f32,
     pub beta2: f32,
     pub eps: f32,
     pub eps_f: f32,
     pub eps_g: f32,
+    pub gclip: f32,
 }
 #[test]
 fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
@@ -2284,7 +2828,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_params__bindgen_ty_1>(),
-        36usize,
+        44usize,
         concat!("Size of: ", stringify!(ggml_opt_params__bindgen_ty_1))
     );
     assert_eq!(
@@ -2323,8 +2867,18 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).decay_min_ndim) as usize - ptr as usize },
         12usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params__bindgen_ty_1),
+            "::",
+            stringify!(decay_min_ndim)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize },
+        16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2334,7 +2888,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).beta1) as usize - ptr as usize },
-        16usize,
+        20usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2344,7 +2898,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).beta2) as usize - ptr as usize },
-        20usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2354,7 +2908,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).eps) as usize - ptr as usize },
-        24usize,
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2364,7 +2918,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).eps_f) as usize - ptr as usize },
-        28usize,
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2374,7 +2928,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).eps_g) as usize - ptr as usize },
-        32usize,
+        36usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2382,6 +2936,16 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
             stringify!(eps_g)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).gclip) as usize - ptr as usize },
+        40usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params__bindgen_ty_1),
+            "::",
+            stringify!(gclip)
+        )
+    );
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
@@ -2508,12 +3072,12 @@ fn bindgen_test_layout_ggml_opt_params() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_params>(),
-        96usize,
+        120usize,
         concat!("Size of: ", stringify!(ggml_opt_params))
     );
     assert_eq!(
         ::std::mem::align_of::<ggml_opt_params>(),
-        4usize,
+        8usize,
         concat!("Alignment of ", stringify!(ggml_opt_params))
     );
     assert_eq!(
@@ -2526,9 +3090,19 @@ fn bindgen_test_layout_ggml_opt_params() {
             stringify!(type_)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).graph_size) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params),
+            "::",
+            stringify!(graph_size)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as usize - ptr as usize },
-        4usize,
+        16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2538,7 +3112,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).past) as usize - ptr as usize },
-        8usize,
+        20usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2548,7 +3122,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).delta) as usize - ptr as usize },
-        12usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2558,7 +3132,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).max_no_improvement) as usize - ptr as usize },
-        16usize,
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2568,7 +3142,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).print_forward_graph) as usize - ptr as usize },
-        20usize,
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2578,7 +3152,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).print_backward_graph) as usize - ptr as usize },
-        21usize,
+        33usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2586,9 +3160,19 @@ fn bindgen_test_layout_ggml_opt_params() {
             stringify!(print_backward_graph)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_gradient_accumulation) as usize - ptr as usize },
+        36usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params),
+            "::",
+            stringify!(n_gradient_accumulation)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).adam) as usize - ptr as usize },
-        24usize,
+        40usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2598,7 +3182,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).lbfgs) as usize - ptr as usize },
-        60usize,
+        84usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2615,19 +3199,17 @@ pub struct ggml_opt_context {
     pub iter: ::std::os::raw::c_int,
     pub nx: i64,
     pub just_initialized: bool,
+    pub loss_before: f32,
+    pub loss_after: f32,
     pub adam: ggml_opt_context__bindgen_ty_1,
     pub lbfgs: ggml_opt_context__bindgen_ty_2,
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_opt_context__bindgen_ty_1 {
-    pub x: *mut ggml_tensor,
-    pub g1: *mut ggml_tensor,
-    pub g2: *mut ggml_tensor,
+    pub g: *mut ggml_tensor,
     pub m: *mut ggml_tensor,
     pub v: *mut ggml_tensor,
-    pub mh: *mut ggml_tensor,
-    pub vh: *mut ggml_tensor,
     pub pf: *mut ggml_tensor,
     pub fx_best: f32,
     pub fx_prev: f32,
@@ -2640,7 +3222,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_context__bindgen_ty_1>(),
-        80usize,
+        48usize,
         concat!("Size of: ", stringify!(ggml_opt_context__bindgen_ty_1))
     );
     assert_eq!(
@@ -2649,78 +3231,38 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
         concat!("Alignment of ", stringify!(ggml_opt_context__bindgen_ty_1))
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).x) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).g) as usize - ptr as usize },
         0usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(x)
+            stringify!(g)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).g1) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize },
         8usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(g1)
+            stringify!(m)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).g2) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize },
         16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(g2)
+            stringify!(v)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).pf) as usize - ptr as usize },
         24usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(m)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize },
-        32usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(v)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).mh) as usize - ptr as usize },
-        40usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(mh)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).vh) as usize - ptr as usize },
-        48usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(vh)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).pf) as usize - ptr as usize },
-        56usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
@@ -2730,7 +3272,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).fx_best) as usize - ptr as usize },
-        64usize,
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
@@ -2740,7 +3282,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).fx_prev) as usize - ptr as usize },
-        68usize,
+        36usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
@@ -2750,7 +3292,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).n_no_improvement) as usize - ptr as usize },
-        72usize,
+        40usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
@@ -2991,7 +3533,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).iter) as usize - ptr as usize },
-        104usize,
+        128usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -3001,7 +3543,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).nx) as usize - ptr as usize },
-        112usize,
+        136usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -3011,7 +3553,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).just_initialized) as usize - ptr as usize },
-        120usize,
+        144usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -3019,9 +3561,29 @@ fn bindgen_test_layout_ggml_opt_context() {
             stringify!(just_initialized)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).loss_before) as usize - ptr as usize },
+        148usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_context),
+            "::",
+            stringify!(loss_before)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).loss_after) as usize - ptr as usize },
+        152usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_context),
+            "::",
+            stringify!(loss_after)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).adam) as usize - ptr as usize },
-        128usize,
+        160usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -3072,6 +3634,8 @@ extern "C" {
         f: *mut ggml_tensor,
         gf: *mut ggml_cgraph,
         gb: *mut ggml_cgraph,
+        callback: ggml_opt_callback,
+        callback_data: *mut ::std::os::raw::c_void,
     ) -> ggml_opt_result;
 }
 extern "C" {
@@ -3120,267 +3684,1781 @@ extern "C" {
     ) -> usize;
 }
 extern "C" {
-    pub fn ggml_quantize_chunk(
-        type_: ggml_type,
+    pub fn ggml_quantize_q2_K(
         src: *const f32,
         dst: *mut ::std::os::raw::c_void,
-        start: ::std::os::raw::c_int,
         n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
         hist: *mut i64,
     ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_avx() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_avx2() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_avx512() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_avx512_vbmi() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_avx512_vnni() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_fma() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_neon() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_arm_fma() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_f16c() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_fp16_va() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_wasm_simd() -> ::std::os::raw::c_int;
-}
-extern "C" {
-    pub fn ggml_cpu_has_blas() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_q3_K(
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_cublas() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_q4_K(
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_clblast() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_q5_K(
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_gpublas() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_q6_K(
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+        k: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
 extern "C" {
-    pub fn ggml_cpu_has_sse3() -> ::std::os::raw::c_int;
+    pub fn ggml_quantize_chunk(
+        type_: ggml_type,
+        src: *const f32,
+        dst: *mut ::std::os::raw::c_void,
+        start: ::std::os::raw::c_int,
+        n: ::std::os::raw::c_int,
+        hist: *mut i64,
+    ) -> usize;
 }
-extern "C" {
-    pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int;
+pub const gguf_type_GGUF_TYPE_UINT8: gguf_type = 0;
+pub const gguf_type_GGUF_TYPE_INT8: gguf_type = 1;
+pub const gguf_type_GGUF_TYPE_UINT16: gguf_type = 2;
+pub const gguf_type_GGUF_TYPE_INT16: gguf_type = 3;
+pub const gguf_type_GGUF_TYPE_UINT32: gguf_type = 4;
+pub const gguf_type_GGUF_TYPE_INT32: gguf_type = 5;
+pub const gguf_type_GGUF_TYPE_FLOAT32: gguf_type = 6;
+pub const gguf_type_GGUF_TYPE_BOOL: gguf_type = 7;
+pub const gguf_type_GGUF_TYPE_STRING: gguf_type = 8;
+pub const gguf_type_GGUF_TYPE_ARRAY: gguf_type = 9;
+pub const gguf_type_GGUF_TYPE_UINT64: gguf_type = 10;
+pub const gguf_type_GGUF_TYPE_INT64: gguf_type = 11;
+pub const gguf_type_GGUF_TYPE_FLOAT64: gguf_type = 12;
+pub const gguf_type_GGUF_TYPE_COUNT: gguf_type = 13;
+pub type gguf_type = ::std::os::raw::c_uint;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct gguf_context {
+    _unused: [u8; 0],
 }
-pub type ggml_to_float_t = ::std::option::Option<
-    unsafe extern "C" fn(x: *const ::std::os::raw::c_void, y: *mut f32, k: ::std::os::raw::c_int),
->;
-pub type ggml_from_float_t = ::std::option::Option<
-    unsafe extern "C" fn(x: *const f32, y: *mut ::std::os::raw::c_void, k: ::std::os::raw::c_int),
->;
-pub type ggml_vec_dot_t = ::std::option::Option<
-    unsafe extern "C" fn(
-        n: ::std::os::raw::c_int,
-        s: *mut f32,
-        x: *const ::std::os::raw::c_void,
-        y: *const ::std::os::raw::c_void,
-    ),
->;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
-pub struct ggml_type_traits_t {
-    pub to_float: ggml_to_float_t,
-    pub from_float: ggml_from_float_t,
-    pub from_float_reference: ggml_from_float_t,
-    pub vec_dot: ggml_vec_dot_t,
-    pub vec_dot_type: ggml_type,
+pub struct gguf_init_params {
+    pub no_alloc: bool,
+    pub ctx: *mut *mut ggml_context,
 }
 #[test]
-fn bindgen_test_layout_ggml_type_traits_t() {
-    const UNINIT: ::std::mem::MaybeUninit<ggml_type_traits_t> = ::std::mem::MaybeUninit::uninit();
+fn bindgen_test_layout_gguf_init_params() {
+    const UNINIT: ::std::mem::MaybeUninit<gguf_init_params> = ::std::mem::MaybeUninit::uninit();
     let ptr = UNINIT.as_ptr();
     assert_eq!(
-        ::std::mem::size_of::<ggml_type_traits_t>(),
-        40usize,
-        concat!("Size of: ", stringify!(ggml_type_traits_t))
+        ::std::mem::size_of::<gguf_init_params>(),
+        16usize,
+        concat!("Size of: ", stringify!(gguf_init_params))
     );
     assert_eq!(
-        ::std::mem::align_of::<ggml_type_traits_t>(),
+        ::std::mem::align_of::<gguf_init_params>(),
         8usize,
-        concat!("Alignment of ", stringify!(ggml_type_traits_t))
+        concat!("Alignment of ", stringify!(gguf_init_params))
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).no_alloc) as usize - ptr as usize },
         0usize,
         concat!(
             "Offset of field: ",
-            stringify!(ggml_type_traits_t),
+            stringify!(gguf_init_params),
             "::",
-            stringify!(to_float)
+            stringify!(no_alloc)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).from_float) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).ctx) as usize - ptr as usize },
         8usize,
         concat!(
             "Offset of field: ",
-            stringify!(ggml_type_traits_t),
-            "::",
-            stringify!(from_float)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).from_float_reference) as usize - ptr as usize },
-        16usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_type_traits_t),
-            "::",
-            stringify!(from_float_reference)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).vec_dot) as usize - ptr as usize },
-        24usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_type_traits_t),
-            "::",
-            stringify!(vec_dot)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_type) as usize - ptr as usize },
-        32usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_type_traits_t),
+            stringify!(gguf_init_params),
             "::",
-            stringify!(vec_dot_type)
+            stringify!(ctx)
         )
     );
 }
 extern "C" {
-    pub fn ggml_internal_get_type_traits(i: ggml_type) -> ggml_type_traits_t;
+    pub fn gguf_init_empty() -> *mut gguf_context;
 }
 extern "C" {
-    pub fn ggml_init_cublas();
+    pub fn gguf_init_from_file(
+        fname: *const ::std::os::raw::c_char,
+        params: gguf_init_params,
+    ) -> *mut gguf_context;
 }
 extern "C" {
-    pub fn ggml_cuda_set_tensor_split(tensor_split: *const f32);
+    pub fn gguf_free(ctx: *mut gguf_context);
 }
 extern "C" {
-    pub fn ggml_cuda_mul(src0: *const ggml_tensor, src1: *const ggml_tensor, dst: *mut ggml_tensor);
+    pub fn gguf_type_name(type_: gguf_type) -> *const ::std::os::raw::c_char;
 }
 extern "C" {
-    pub fn ggml_cuda_can_mul_mat(
-        src0: *const ggml_tensor,
-        src1: *const ggml_tensor,
-        dst: *mut ggml_tensor,
-    ) -> bool;
+    pub fn gguf_get_version(ctx: *const gguf_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn ggml_cuda_mul_mat_get_wsize(
-        src0: *const ggml_tensor,
-        src1: *const ggml_tensor,
-        dst: *mut ggml_tensor,
-    ) -> usize;
+    pub fn gguf_get_alignment(ctx: *const gguf_context) -> usize;
 }
 extern "C" {
-    pub fn ggml_cuda_mul_mat(
-        src0: *const ggml_tensor,
-        src1: *const ggml_tensor,
-        dst: *mut ggml_tensor,
-        wdata: *mut ::std::os::raw::c_void,
-        wsize: usize,
-    );
+    pub fn gguf_get_data_offset(ctx: *const gguf_context) -> usize;
 }
 extern "C" {
-    pub fn ggml_cuda_host_malloc(size: usize) -> *mut ::std::os::raw::c_void;
+    pub fn gguf_get_data(ctx: *const gguf_context) -> *mut ::std::os::raw::c_void;
 }
 extern "C" {
-    pub fn ggml_cuda_host_free(ptr: *mut ::std::os::raw::c_void);
+    pub fn gguf_get_n_kv(ctx: *const gguf_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn ggml_cuda_transform_tensor(data: *mut ::std::os::raw::c_void, tensor: *mut ggml_tensor);
+    pub fn gguf_find_key(
+        ctx: *const gguf_context,
+        key: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn ggml_cuda_free_data(tensor: *mut ggml_tensor);
+    pub fn gguf_get_key(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_char;
 }
 extern "C" {
-    pub fn ggml_cuda_assign_buffers(tensor: *mut ggml_tensor);
+    pub fn gguf_get_kv_type(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> gguf_type;
 }
 extern "C" {
-    pub fn ggml_cuda_assign_buffers_no_scratch(tensor: *mut ggml_tensor);
+    pub fn gguf_get_arr_type(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> gguf_type;
 }
 extern "C" {
-    pub fn ggml_cuda_assign_buffers_force_inplace(tensor: *mut ggml_tensor);
+    pub fn gguf_get_val_u8(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> u8;
 }
 extern "C" {
-    pub fn ggml_cuda_set_main_device(main_device: ::std::os::raw::c_int);
+    pub fn gguf_get_val_i8(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> i8;
 }
 extern "C" {
-    pub fn ggml_cuda_set_mul_mat_q(mul_mat_q: bool);
+    pub fn gguf_get_val_u16(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> u16;
 }
 extern "C" {
-    pub fn ggml_cuda_set_scratch_size(scratch_size: usize);
+    pub fn gguf_get_val_i16(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> i16;
 }
 extern "C" {
-    pub fn ggml_cuda_free_scratch();
+    pub fn gguf_get_val_u32(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> u32;
 }
 extern "C" {
-    pub fn ggml_cuda_compute_forward(
-        params: *mut ggml_compute_params,
-        tensor: *mut ggml_tensor,
-    ) -> bool;
+    pub fn gguf_get_val_i32(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> i32;
 }
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct llama_model {
-    _unused: [u8; 0],
+extern "C" {
+    pub fn gguf_get_val_f32(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> f32;
 }
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct llama_context {
-    _unused: [u8; 0],
+extern "C" {
+    pub fn gguf_get_val_u64(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> u64;
 }
-pub type llama_token = ::std::os::raw::c_int;
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct llama_token_data {
-    pub id: llama_token,
-    pub logit: f32,
-    pub p: f32,
+extern "C" {
+    pub fn gguf_get_val_i64(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> i64;
 }
-#[test]
-fn bindgen_test_layout_llama_token_data() {
-    const UNINIT: ::std::mem::MaybeUninit<llama_token_data> = ::std::mem::MaybeUninit::uninit();
-    let ptr = UNINIT.as_ptr();
-    assert_eq!(
-        ::std::mem::size_of::<llama_token_data>(),
-        12usize,
-        concat!("Size of: ", stringify!(llama_token_data))
-    );
-    assert_eq!(
-        ::std::mem::align_of::<llama_token_data>(),
-        4usize,
-        concat!("Alignment of ", stringify!(llama_token_data))
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).id) as usize - ptr as usize },
-        0usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(llama_token_data),
-            "::",
+extern "C" {
+    pub fn gguf_get_val_f64(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> f64;
+}
+extern "C" {
+    pub fn gguf_get_val_bool(ctx: *const gguf_context, key_id: ::std::os::raw::c_int) -> bool;
+}
+extern "C" {
+    pub fn gguf_get_val_str(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_get_val_data(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_void;
+}
+extern "C" {
+    pub fn gguf_get_arr_n(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_get_arr_data(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_void;
+}
+extern "C" {
+    pub fn gguf_get_arr_str(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+        i: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_get_n_tensors(ctx: *const gguf_context) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_find_tensor(
+        ctx: *const gguf_context,
+        name: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_get_tensor_offset(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> usize;
+}
+extern "C" {
+    pub fn gguf_get_tensor_name(
+        ctx: *const gguf_context,
+        i: ::std::os::raw::c_int,
+    ) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_set_val_u8(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u8);
+}
+extern "C" {
+    pub fn gguf_set_val_i8(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i8);
+}
+extern "C" {
+    pub fn gguf_set_val_u16(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u16);
+}
+extern "C" {
+    pub fn gguf_set_val_i16(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i16);
+}
+extern "C" {
+    pub fn gguf_set_val_u32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u32);
+}
+extern "C" {
+    pub fn gguf_set_val_i32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i32);
+}
+extern "C" {
+    pub fn gguf_set_val_f32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: f32);
+}
+extern "C" {
+    pub fn gguf_set_val_u64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u64);
+}
+extern "C" {
+    pub fn gguf_set_val_i64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i64);
+}
+extern "C" {
+    pub fn gguf_set_val_f64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: f64);
+}
+extern "C" {
+    pub fn gguf_set_val_bool(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: bool);
+}
+extern "C" {
+    pub fn gguf_set_val_str(
+        ctx: *mut gguf_context,
+        key: *const ::std::os::raw::c_char,
+        val: *const ::std::os::raw::c_char,
+    );
+}
+extern "C" {
+    pub fn gguf_set_arr_data(
+        ctx: *mut gguf_context,
+        key: *const ::std::os::raw::c_char,
+        type_: gguf_type,
+        data: *const ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+    );
+}
+extern "C" {
+    pub fn gguf_set_arr_str(
+        ctx: *mut gguf_context,
+        key: *const ::std::os::raw::c_char,
+        data: *mut *const ::std::os::raw::c_char,
+        n: ::std::os::raw::c_int,
+    );
+}
+extern "C" {
+    pub fn gguf_set_kv(ctx: *mut gguf_context, src: *mut gguf_context);
+}
+extern "C" {
+    pub fn gguf_add_tensor(ctx: *mut gguf_context, tensor: *const ggml_tensor);
+}
+extern "C" {
+    pub fn gguf_set_tensor_type(
+        ctx: *mut gguf_context,
+        name: *const ::std::os::raw::c_char,
+        type_: ggml_type,
+    );
+}
+extern "C" {
+    pub fn gguf_set_tensor_data(
+        ctx: *mut gguf_context,
+        name: *const ::std::os::raw::c_char,
+        data: *const ::std::os::raw::c_void,
+        size: usize,
+    );
+}
+extern "C" {
+    pub fn gguf_write_to_file(
+        ctx: *const gguf_context,
+        fname: *const ::std::os::raw::c_char,
+        only_meta: bool,
+    );
+}
+extern "C" {
+    pub fn gguf_get_meta_size(ctx: *const gguf_context) -> usize;
+}
+extern "C" {
+    pub fn gguf_get_meta_data(ctx: *const gguf_context, data: *mut ::std::os::raw::c_void);
+}
+extern "C" { // ggml_cpu_has_* declarations (through ggml_cpu_has_vsx): each takes no arguments and returns a C int.
+    pub fn ggml_cpu_has_avx() -> ::std::os::raw::c_int; // presumably nonzero means "available" — confirm in ggml.h
+}
+extern "C" {
+    pub fn ggml_cpu_has_avx2() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_avx512() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_avx512_vbmi() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_avx512_vnni() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_fma() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_neon() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_arm_fma() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_metal() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_f16c() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_fp16_va() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_wasm_simd() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_blas() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_cublas() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_clblast() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_gpublas() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_sse3() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_ssse3() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int;
+}
+pub type ggml_to_float_t = ::std::option::Option< // nullable C callback: untyped input `x`, f32 output `y`
+    unsafe extern "C" fn(x: *const ::std::os::raw::c_void, y: *mut f32, k: ::std::os::raw::c_int), // `k` is presumably an element count — confirm
+>;
+pub type ggml_from_float_t = ::std::option::Option< // nullable C callback: f32 input `x`, untyped output `y`
+    unsafe extern "C" fn(x: *const f32, y: *mut ::std::os::raw::c_void, k: ::std::os::raw::c_int),
+>;
+pub type ggml_vec_dot_t = ::std::option::Option< // nullable C callback taking two untyped read-only inputs
+    unsafe extern "C" fn(
+        n: ::std::os::raw::c_int,
+        s: *mut f32, // the only writable pointer; presumably receives the result — confirm
+        x: *const ::std::os::raw::c_void,
+        y: *const ::std::os::raw::c_void,
+    ),
+>;
+#[repr(C)] // C field order and padding are required: the struct is returned by value across the FFI boundary.
+#[derive(Debug, Copy, Clone)]
+pub struct ggml_type_traits_t { // Per-type record: a name, two sizes, a flag and four optional callbacks.
+    pub type_name: *const ::std::os::raw::c_char, // C string pointer; ownership not visible here
+    pub blck_size: ::std::os::raw::c_int,
+    pub type_size: usize,
+    pub is_quantized: bool,
+    pub to_float: ggml_to_float_t, // Option-wrapped, so may be None
+    pub from_float: ggml_from_float_t,
+    pub from_float_reference: ggml_from_float_t, // same callback type as `from_float`
+    pub vec_dot: ggml_vec_dot_t,
+    pub vec_dot_type: ggml_type,
+}
+#[test] // Layout check for ggml_type_traits_t: total size, alignment, then the offset of every field.
+fn bindgen_test_layout_ggml_type_traits_t() {
+    const UNINIT: ::std::mem::MaybeUninit<ggml_type_traits_t> = ::std::mem::MaybeUninit::uninit(); // never read; used only for address arithmetic
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<ggml_type_traits_t>(),
+        72usize, // NOTE(review): expected values look specific to a 64-bit target — confirm for every CI target
+        concat!("Size of: ", stringify!(ggml_type_traits_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<ggml_type_traits_t>(),
+        8usize,
+        concat!("Alignment of ", stringify!(ggml_type_traits_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).type_name) as usize - ptr as usize }, // addr_of! takes the field address without reading it
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(type_name)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).blck_size) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(blck_size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).type_size) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(type_size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).is_quantized) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(is_quantized)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize },
+        32usize, // the 1-byte bool at 24 is followed by padding up to this offset
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(to_float)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).from_float) as usize - ptr as usize },
+        40usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(from_float)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).from_float_reference) as usize - ptr as usize },
+        48usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(from_float_reference)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).vec_dot) as usize - ptr as usize },
+        56usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(vec_dot)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_type) as usize - ptr as usize },
+        64usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(vec_dot_type)
+        )
+    );
+}
+extern "C" { // FFI: takes a ggml_type and returns a ggml_type_traits_t by value (not a pointer).
+    pub fn ggml_internal_get_type_traits(type_: ggml_type) -> ggml_type_traits_t;
+}
+pub type va_list = __builtin_va_list; // both aliases on these two lines name the same __builtin_va_list type
+pub type __gnuc_va_list = __builtin_va_list;
+#[repr(C)]
+#[derive(Copy, Clone)] // no Debug derive here: the `__value` field is a union
+pub struct __mbstate_t { // A C int counter followed by a 4-byte union value.
+    pub __count: ::std::os::raw::c_int,
+    pub __value: __mbstate_t__bindgen_ty_1,
+}
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union __mbstate_t__bindgen_ty_1 { // Two overlapping views of the same storage; reading a field requires `unsafe`.
+    pub __wch: ::std::os::raw::c_uint, // view as one unsigned int
+    pub __wchb: [::std::os::raw::c_char; 4usize], // view as four C chars
+}
+#[test] // Layout check for the union: size, alignment, and that both fields start at offset 0.
+fn bindgen_test_layout___mbstate_t__bindgen_ty_1() {
+    const UNINIT: ::std::mem::MaybeUninit<__mbstate_t__bindgen_ty_1> =
+        ::std::mem::MaybeUninit::uninit(); // never read; used only for address arithmetic
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<__mbstate_t__bindgen_ty_1>(),
+        4usize,
+        concat!("Size of: ", stringify!(__mbstate_t__bindgen_ty_1))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<__mbstate_t__bindgen_ty_1>(),
+        4usize,
+        concat!("Alignment of ", stringify!(__mbstate_t__bindgen_ty_1))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__wch) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__mbstate_t__bindgen_ty_1),
+            "::",
+            stringify!(__wch)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__wchb) as usize - ptr as usize },
+        0usize, // same offset as __wch: union members overlap
+        concat!(
+            "Offset of field: ",
+            stringify!(__mbstate_t__bindgen_ty_1),
+            "::",
+            stringify!(__wchb)
+        )
+    );
+}
+#[test] // Layout check for __mbstate_t: 8 bytes, 4-byte aligned, fields at offsets 0 and 4.
+fn bindgen_test_layout___mbstate_t() {
+    const UNINIT: ::std::mem::MaybeUninit<__mbstate_t> = ::std::mem::MaybeUninit::uninit(); // never read; used only for address arithmetic
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<__mbstate_t>(),
+        8usize,
+        concat!("Size of: ", stringify!(__mbstate_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<__mbstate_t>(),
+        4usize,
+        concat!("Alignment of ", stringify!(__mbstate_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__count) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__mbstate_t),
+            "::",
+            stringify!(__count)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__value) as usize - ptr as usize },
+        4usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__mbstate_t),
+            "::",
+            stringify!(__value)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct _G_fpos_t { // Pairs an __off_t position with an __mbstate_t value.
+    pub __pos: __off_t,
+    pub __state: __mbstate_t,
+}
+#[test] // Layout check for _G_fpos_t: size, alignment and both field offsets.
+fn bindgen_test_layout__G_fpos_t() {
+    const UNINIT: ::std::mem::MaybeUninit<_G_fpos_t> = ::std::mem::MaybeUninit::uninit(); // never read; used only for address arithmetic
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<_G_fpos_t>(),
+        16usize, // with __state at offset 8 this implies an 8-byte __off_t; NOTE(review): confirm on targets where it is narrower
+        concat!("Size of: ", stringify!(_G_fpos_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<_G_fpos_t>(),
+        8usize,
+        concat!("Alignment of ", stringify!(_G_fpos_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__pos) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_G_fpos_t),
+            "::",
+            stringify!(__pos)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__state) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_G_fpos_t),
+            "::",
+            stringify!(__state)
+        )
+    );
+}
+pub type __fpos_t = _G_fpos_t; // plain alias, no new type
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct _G_fpos64_t { // Same shape as _G_fpos_t, but the position field is an __off64_t.
+    pub __pos: __off64_t,
+    pub __state: __mbstate_t,
+}
+#[test] // Layout check for _G_fpos64_t: size, alignment and both field offsets.
+fn bindgen_test_layout__G_fpos64_t() {
+    const UNINIT: ::std::mem::MaybeUninit<_G_fpos64_t> = ::std::mem::MaybeUninit::uninit(); // never read; used only for address arithmetic
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<_G_fpos64_t>(),
+        16usize,
+        concat!("Size of: ", stringify!(_G_fpos64_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<_G_fpos64_t>(),
+        8usize,
+        concat!("Alignment of ", stringify!(_G_fpos64_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__pos) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_G_fpos64_t),
+            "::",
+            stringify!(__pos)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__state) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_G_fpos64_t),
+            "::",
+            stringify!(__state)
+        )
+    );
+}
+pub type __fpos64_t = _G_fpos64_t;
+pub type __FILE = _IO_FILE; // __FILE and FILE are two names for the same _IO_FILE struct
+pub type FILE = _IO_FILE;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_marker { // Opaque placeholder: no public fields; _IO_FILE refers to it only by pointer.
+    _unused: [u8; 0], // zero-length private field
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_codecvt { // Opaque placeholder: no public fields; _IO_FILE refers to it only by pointer.
+    _unused: [u8; 0],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_wide_data { // Opaque placeholder: no public fields; _IO_FILE refers to it only by pointer.
+    _unused: [u8; 0],
+}
+pub type _IO_lock_t = ::std::os::raw::c_void; // lock type is untyped here; _IO_FILE holds it as *mut _IO_lock_t
+#[repr(C)] // field order and padding must follow the C definition; the layout test pins every offset
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_FILE { // Full field-by-field mirror of the C stream struct behind the `FILE` alias.
+    pub _flags: ::std::os::raw::c_int,
+    pub _IO_read_ptr: *mut ::std::os::raw::c_char, // first of eleven consecutive char-pointer fields
+    pub _IO_read_end: *mut ::std::os::raw::c_char,
+    pub _IO_read_base: *mut ::std::os::raw::c_char,
+    pub _IO_write_base: *mut ::std::os::raw::c_char,
+    pub _IO_write_ptr: *mut ::std::os::raw::c_char,
+    pub _IO_write_end: *mut ::std::os::raw::c_char,
+    pub _IO_buf_base: *mut ::std::os::raw::c_char,
+    pub _IO_buf_end: *mut ::std::os::raw::c_char,
+    pub _IO_save_base: *mut ::std::os::raw::c_char,
+    pub _IO_backup_base: *mut ::std::os::raw::c_char,
+    pub _IO_save_end: *mut ::std::os::raw::c_char,
+    pub _markers: *mut _IO_marker, // pointer to an opaque type
+    pub _chain: *mut _IO_FILE, // self-referential pointer to another _IO_FILE
+    pub _fileno: ::std::os::raw::c_int, // presumably the underlying file descriptor — confirm
+    pub _flags2: ::std::os::raw::c_int,
+    pub _old_offset: __off_t,
+    pub _cur_column: ::std::os::raw::c_ushort,
+    pub _vtable_offset: ::std::os::raw::c_schar,
+    pub _shortbuf: [::std::os::raw::c_char; 1usize],
+    pub _lock: *mut _IO_lock_t, // _IO_lock_t is an alias for c_void
+    pub _offset: __off64_t,
+    pub _codecvt: *mut _IO_codecvt, // pointer to an opaque type
+    pub _wide_data: *mut _IO_wide_data, // pointer to an opaque type
+    pub _freeres_list: *mut _IO_FILE,
+    pub _freeres_buf: *mut ::std::os::raw::c_void,
+    pub __pad5: usize,
+    pub _mode: ::std::os::raw::c_int,
+    pub _unused2: [::std::os::raw::c_char; 20usize], // final field: offset 196 + 20 bytes = the asserted size of 216
+}
+#[test] // Layout check for _IO_FILE: total size, alignment, then the offset of each of its 29 fields.
+fn bindgen_test_layout__IO_FILE() {
+    const UNINIT: ::std::mem::MaybeUninit<_IO_FILE> = ::std::mem::MaybeUninit::uninit(); // never read; used only for address arithmetic
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<_IO_FILE>(),
+        216usize, // NOTE(review): expected values look specific to a 64-bit target — confirm for every CI target
+        concat!("Size of: ", stringify!(_IO_FILE))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<_IO_FILE>(),
+        8usize,
+        concat!("Alignment of ", stringify!(_IO_FILE))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._flags) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_flags)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_read_ptr) as usize - ptr as usize },
+        8usize, // pointer fields from here to _chain are spaced 8 bytes apart
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_read_ptr)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_read_end) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_read_end)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_read_base) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_read_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_write_base) as usize - ptr as usize },
+        32usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_write_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_write_ptr) as usize - ptr as usize },
+        40usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_write_ptr)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_write_end) as usize - ptr as usize },
+        48usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_write_end)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_buf_base) as usize - ptr as usize },
+        56usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_buf_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_buf_end) as usize - ptr as usize },
+        64usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_buf_end)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_save_base) as usize - ptr as usize },
+        72usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_save_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_backup_base) as usize - ptr as usize },
+        80usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_backup_base)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._IO_save_end) as usize - ptr as usize },
+        88usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_IO_save_end)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._markers) as usize - ptr as usize },
+        96usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_markers)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._chain) as usize - ptr as usize },
+        104usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_chain)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._fileno) as usize - ptr as usize },
+        112usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_fileno)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._flags2) as usize - ptr as usize },
+        116usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_flags2)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._old_offset) as usize - ptr as usize },
+        120usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_old_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._cur_column) as usize - ptr as usize },
+        128usize, // 8 bytes after _old_offset, i.e. this expects an 8-byte __off_t
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_cur_column)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._vtable_offset) as usize - ptr as usize },
+        130usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_vtable_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._shortbuf) as usize - ptr as usize },
+        131usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_shortbuf)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._lock) as usize - ptr as usize },
+        136usize, // the 1-byte _shortbuf at 131 is followed by padding up to this offset
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_lock)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._offset) as usize - ptr as usize },
+        144usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._codecvt) as usize - ptr as usize },
+        152usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_codecvt)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._wide_data) as usize - ptr as usize },
+        160usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_wide_data)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._freeres_list) as usize - ptr as usize },
+        168usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_freeres_list)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._freeres_buf) as usize - ptr as usize },
+        176usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_freeres_buf)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).__pad5) as usize - ptr as usize },
+        184usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(__pad5)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._mode) as usize - ptr as usize },
+        192usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_mode)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr)._unused2) as usize - ptr as usize },
+        196usize, // 196 + the 20-byte array = 216, the size asserted at the top
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_FILE),
+            "::",
+            stringify!(_unused2)
+        )
+    );
+}
+pub type cookie_read_function_t = ::std::option::Option< // nullable C callback; `__buf` is writable
+    unsafe extern "C" fn(
+        __cookie: *mut ::std::os::raw::c_void, // untyped pointer passed as first argument to all four cookie callbacks
+        __buf: *mut ::std::os::raw::c_char,
+        __nbytes: usize,
+    ) -> __ssize_t,
+>;
+pub type cookie_write_function_t = ::std::option::Option< // nullable C callback; `__buf` is read-only
+    unsafe extern "C" fn(
+        __cookie: *mut ::std::os::raw::c_void,
+        __buf: *const ::std::os::raw::c_char,
+        __nbytes: usize,
+    ) -> __ssize_t,
+>;
+pub type cookie_seek_function_t = ::std::option::Option< // nullable C callback; position is passed by mutable pointer
+    unsafe extern "C" fn(
+        __cookie: *mut ::std::os::raw::c_void,
+        __pos: *mut __off64_t, // presumably updated in place with the new position — confirm
+        __w: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int,
+>;
+pub type cookie_close_function_t = ::std::option::Option< // nullable C callback taking only the cookie
+    unsafe extern "C" fn(__cookie: *mut ::std::os::raw::c_void) -> ::std::os::raw::c_int,
+>;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct _IO_cookie_io_functions_t { // Table of four optional callbacks; any entry may be None.
+    pub read: cookie_read_function_t,
+    pub write: cookie_write_function_t,
+    pub seek: cookie_seek_function_t,
+    pub close: cookie_close_function_t,
+}
+#[test] // Layout check for the callback table: size, alignment and the offset of each of the four entries.
+fn bindgen_test_layout__IO_cookie_io_functions_t() {
+    const UNINIT: ::std::mem::MaybeUninit<_IO_cookie_io_functions_t> =
+        ::std::mem::MaybeUninit::uninit(); // never read; used only for address arithmetic
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<_IO_cookie_io_functions_t>(),
+        32usize, // four entries spaced 8 bytes apart (offsets 0, 8, 16, 24 below)
+        concat!("Size of: ", stringify!(_IO_cookie_io_functions_t))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<_IO_cookie_io_functions_t>(),
+        8usize,
+        concat!("Alignment of ", stringify!(_IO_cookie_io_functions_t))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).read) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_cookie_io_functions_t),
+            "::",
+            stringify!(read)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).write) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_cookie_io_functions_t),
+            "::",
+            stringify!(write)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).seek) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_cookie_io_functions_t),
+            "::",
+            stringify!(seek)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).close) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(_IO_cookie_io_functions_t),
+            "::",
+            stringify!(close)
+        )
+    );
+}
+pub type cookie_io_functions_t = _IO_cookie_io_functions_t; // public name for the callback table struct
+pub type off_t = __off_t; // the four aliases below re-export the double-underscore types under their unprefixed names
+pub type off64_t = __off64_t;
+pub type fpos_t = __fpos_t;
+pub type fpos64_t = __fpos64_t;
+extern "C" { // External mutable statics holding FILE pointers; any read or write from Rust requires `unsafe`.
+    pub static mut stdin: *mut FILE;
+}
+extern "C" {
+    pub static mut stdout: *mut FILE;
+}
+extern "C" {
+    pub static mut stderr: *mut FILE;
+}
+extern "C" { // FFI: takes a C string path and returns a C int status.
+    pub fn remove(__filename: *const ::std::os::raw::c_char) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: old and new names as C strings; returns a C int status.
+    pub fn rename(
+        __old: *const ::std::os::raw::c_char,
+        __new: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: like `rename` but each name is paired with an integer fd argument.
+    pub fn renameat(
+        __oldfd: ::std::os::raw::c_int, // presumably a directory fd that __old is resolved against — confirm
+        __old: *const ::std::os::raw::c_char,
+        __newfd: ::std::os::raw::c_int,
+        __new: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: same parameters as `renameat` plus a trailing unsigned flags word.
+    pub fn renameat2(
+        __oldfd: ::std::os::raw::c_int,
+        __old: *const ::std::os::raw::c_char,
+        __newfd: ::std::os::raw::c_int,
+        __new: *const ::std::os::raw::c_char,
+        __flags: ::std::os::raw::c_uint,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: takes a stream pointer and returns a C int status.
+    pub fn fclose(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: no arguments; returns a stream pointer (callers should presumably check for null — confirm).
+    pub fn tmpfile() -> *mut FILE;
+}
+extern "C" { // FFI: same signature as `tmpfile`.
+    pub fn tmpfile64() -> *mut FILE;
+}
+extern "C" { // FFI: takes a writable char buffer and returns a char pointer.
+    pub fn tmpnam(arg1: *mut ::std::os::raw::c_char) -> *mut ::std::os::raw::c_char;
+}
+extern "C" { // FFI: same signature as `tmpnam`.
+    pub fn tmpnam_r(__s: *mut ::std::os::raw::c_char) -> *mut ::std::os::raw::c_char;
+}
+extern "C" { // FFI: directory and prefix as C strings; returns a mutable char pointer (ownership not visible here).
+    pub fn tempnam(
+        __dir: *const ::std::os::raw::c_char,
+        __pfx: *const ::std::os::raw::c_char,
+    ) -> *mut ::std::os::raw::c_char;
+}
+extern "C" { // FFI: takes a stream pointer and returns a C int status.
+    pub fn fflush(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: same signature as `fflush`.
+    pub fn fflush_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: no arguments; returns a C int status.
+    pub fn fcloseall() -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: file name and mode as C strings; returns a stream pointer (presumably null on failure — confirm).
+    pub fn fopen(
+        __filename: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+    ) -> *mut FILE;
+}
+extern "C" { // FFI: like `fopen` but also takes an existing stream pointer.
+    pub fn freopen(
+        __filename: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+        __stream: *mut FILE,
+    ) -> *mut FILE;
+}
+extern "C" { // FFI: same signature as `fopen`.
+    pub fn fopen64(
+        __filename: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+    ) -> *mut FILE;
+}
+extern "C" { // FFI: same signature as `freopen`.
+    pub fn freopen64(
+        __filename: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+        __stream: *mut FILE,
+    ) -> *mut FILE;
+}
+extern "C" { // FFI: builds a stream pointer from an integer fd and a mode string.
+    pub fn fdopen(__fd: ::std::os::raw::c_int, __modes: *const ::std::os::raw::c_char)
+        -> *mut FILE;
+}
+extern "C" { // FFI: takes an untyped cookie pointer, a mode string and a callback table.
+    pub fn fopencookie(
+        __magic_cookie: *mut ::std::os::raw::c_void,
+        __modes: *const ::std::os::raw::c_char,
+        __io_funcs: cookie_io_functions_t, // passed by value (the struct is Copy), not by pointer
+    ) -> *mut FILE;
+}
+extern "C" { // FFI: takes a writable buffer, its length and a mode string.
+    pub fn fmemopen(
+        __s: *mut ::std::os::raw::c_void,
+        __len: usize,
+        __modes: *const ::std::os::raw::c_char,
+    ) -> *mut FILE;
+}
+extern "C" { // FFI: both arguments are out-pointers written by the callee.
+    pub fn open_memstream(
+        __bufloc: *mut *mut ::std::os::raw::c_char, // receives a buffer pointer; who frees it is not visible here
+        __sizeloc: *mut usize,
+    ) -> *mut FILE;
+}
+extern "C" { // FFI: attaches a caller-provided char buffer to a stream; no return value.
+    pub fn setbuf(__stream: *mut FILE, __buf: *mut ::std::os::raw::c_char);
+}
+extern "C" { // FFI: buffer, integer mode and size; the only function in this group that returns a status.
+    pub fn setvbuf(
+        __stream: *mut FILE,
+        __buf: *mut ::std::os::raw::c_char,
+        __modes: ::std::os::raw::c_int,
+        __n: usize, // presumably the size of __buf in bytes — confirm
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: like `setbuf` with an explicit size; no return value.
+    pub fn setbuffer(__stream: *mut FILE, __buf: *mut ::std::os::raw::c_char, __size: usize);
+}
+extern "C" { // FFI: takes only the stream; no return value.
+    pub fn setlinebuf(__stream: *mut FILE);
+}
+extern "C" { // printf family starts here: every function takes a C format string and returns a C int.
+    pub fn fprintf(
+        __stream: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        ... // C variadic arguments; Rust cannot check them against the format string
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: variadic; takes only the format string before the varargs.
+    pub fn printf(__format: *const ::std::os::raw::c_char, ...) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: variadic; writes into `__s` with no length parameter in the signature.
+    pub fn sprintf(
+        __s: *mut ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: non-variadic form of `fprintf`; arguments arrive through `__arg`.
+    pub fn vfprintf(
+        __s: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag, // argument list passed as a pointer to __va_list_tag
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: non-variadic form of `printf`.
+    pub fn vprintf(
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: non-variadic form of `sprintf`.
+    pub fn vsprintf(
+        __s: *mut ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: variadic; like `sprintf` but with an explicit `__maxlen`.
+    pub fn snprintf(
+        __s: *mut ::std::os::raw::c_char,
+        __maxlen: usize,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: non-variadic form of `snprintf`.
+    pub fn vsnprintf(
+        __s: *mut ::std::os::raw::c_char,
+        __maxlen: usize,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: non-variadic; result string is returned through the `__ptr` out-pointer.
+    pub fn vasprintf(
+        __ptr: *mut *mut ::std::os::raw::c_char, // presumably receives a callee-allocated string — confirm who frees it
+        __f: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: variadic; identical signature to `asprintf` below.
+    pub fn __asprintf(
+        __ptr: *mut *mut ::std::os::raw::c_char,
+        __fmt: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: variadic; result string is returned through the `__ptr` out-pointer.
+    pub fn asprintf(
+        __ptr: *mut *mut ::std::os::raw::c_char,
+        __fmt: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: non-variadic; target is an integer fd instead of a stream.
+    pub fn vdprintf(
+        __fd: ::std::os::raw::c_int,
+        __fmt: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: variadic; target is an integer fd instead of a stream.
+    pub fn dprintf(
+        __fd: ::std::os::raw::c_int,
+        __fmt: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: variadic; reads from a stream according to a C format string.
+    pub fn fscanf(
+        __stream: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        ... // C variadic arguments; Rust cannot check them against the format string
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: variadic; takes only the format string before the varargs.
+    pub fn scanf(__format: *const ::std::os::raw::c_char, ...) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: variadic; input comes from the read-only C string `__s`.
+    pub fn sscanf(
+        __s: *const ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+pub type _Float32 = f32;
+pub type _Float64 = f64;
+pub type _Float32x = f64; // the extended 32-bit type is mapped to the same Rust type as _Float64
+pub type _Float64x = u128; // NOTE(review): integer stand-in, not a float type; looks like a size-only placeholder — do not use for arithmetic, confirm ABI
+extern "C" { // Rust name `fscanf1` binds to the C symbol named in link_name; plain `fscanf` is declared separately in this file.
+    #[link_name = "\u{1}__isoc99_fscanf"] // leading \u{1} presumably suppresses symbol-name mangling — confirm in bindgen docs
+    pub fn fscanf1(
+        __stream: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // Rust name `scanf1` binds to the C symbol named in link_name.
+    #[link_name = "\u{1}__isoc99_scanf"]
+    pub fn scanf1(__format: *const ::std::os::raw::c_char, ...) -> ::std::os::raw::c_int;
+}
+extern "C" { // Rust name `sscanf1` binds to the C symbol named in link_name.
+    #[link_name = "\u{1}__isoc99_sscanf"]
+    pub fn sscanf1(
+        __s: *const ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: non-variadic scanf form for streams; arguments arrive through `__arg`.
+    pub fn vfscanf(
+        __s: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag, // argument list passed as a pointer to __va_list_tag
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: non-variadic scanf form taking only a format string and argument list.
+    pub fn vscanf(
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: non-variadic scanf form reading from the C string `__s`.
+    pub fn vsscanf(
+        __s: *const ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // Same signature as `vfscanf`, bound to the symbol named in link_name.
+    #[link_name = "\u{1}__isoc99_vfscanf"]
+    pub fn vfscanf1(
+        __s: *mut FILE,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // Same signature as `vscanf`, bound to the symbol named in link_name.
+    #[link_name = "\u{1}__isoc99_vscanf"]
+    pub fn vscanf1(
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // Same signature as `vsscanf`, bound to the symbol named in link_name.
+    #[link_name = "\u{1}__isoc99_vsscanf"]
+    pub fn vsscanf1(
+        __s: *const ::std::os::raw::c_char,
+        __format: *const ::std::os::raw::c_char,
+        __arg: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" { // Character I/O group: all functions below return a C int, not a char-sized value.
+    pub fn fgetc(__stream: *mut FILE) -> ::std::os::raw::c_int; // presumably the int return leaves room for an EOF sentinel — confirm
+}
+extern "C" { // FFI: same signature as `fgetc`.
+    pub fn getc(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: no stream argument.
+    pub fn getchar() -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: same signature as `getc`.
+    pub fn getc_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: same signature as `getchar`.
+    pub fn getchar_unlocked() -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: same signature as `fgetc`.
+    pub fn fgetc_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: the character is passed as a C int, followed by the stream.
+    pub fn fputc(__c: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: same signature as `fputc`.
+    pub fn putc(__c: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: no stream argument.
+    pub fn putchar(__c: ::std::os::raw::c_int) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: same signature as `fputc`.
+    pub fn fputc_unlocked(__c: ::std::os::raw::c_int, __stream: *mut FILE)
+        -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: same signature as `putc`.
+    pub fn putc_unlocked(__c: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: same signature as `putchar`.
+    pub fn putchar_unlocked(__c: ::std::os::raw::c_int) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: takes a stream and returns a C int.
+    pub fn getw(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" { // FFI: takes a C int value and a stream; returns a C int.
+    pub fn putw(__w: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fgets(
+        __s: *mut ::std::os::raw::c_char,
+        __n: ::std::os::raw::c_int,
+        __stream: *mut FILE,
+    ) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn fgets_unlocked(
+        __s: *mut ::std::os::raw::c_char,
+        __n: ::std::os::raw::c_int,
+        __stream: *mut FILE,
+    ) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn __getdelim(
+        __lineptr: *mut *mut ::std::os::raw::c_char,
+        __n: *mut usize,
+        __delimiter: ::std::os::raw::c_int,
+        __stream: *mut FILE,
+    ) -> __ssize_t;
+}
+extern "C" {
+    pub fn getdelim(
+        __lineptr: *mut *mut ::std::os::raw::c_char,
+        __n: *mut usize,
+        __delimiter: ::std::os::raw::c_int,
+        __stream: *mut FILE,
+    ) -> __ssize_t;
+}
+extern "C" {
+    pub fn getline(
+        __lineptr: *mut *mut ::std::os::raw::c_char,
+        __n: *mut usize,
+        __stream: *mut FILE,
+    ) -> __ssize_t;
+}
+extern "C" {
+    pub fn fputs(__s: *const ::std::os::raw::c_char, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn puts(__s: *const ::std::os::raw::c_char) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ungetc(__c: ::std::os::raw::c_int, __stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fread(
+        __ptr: *mut ::std::os::raw::c_void,
+        __size: usize,
+        __n: usize,
+        __stream: *mut FILE,
+    ) -> usize;
+}
+extern "C" {
+    pub fn fwrite(
+        __ptr: *const ::std::os::raw::c_void,
+        __size: usize,
+        __n: usize,
+        __s: *mut FILE,
+    ) -> usize;
+}
+extern "C" {
+    pub fn fputs_unlocked(
+        __s: *const ::std::os::raw::c_char,
+        __stream: *mut FILE,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fread_unlocked(
+        __ptr: *mut ::std::os::raw::c_void,
+        __size: usize,
+        __n: usize,
+        __stream: *mut FILE,
+    ) -> usize;
+}
+extern "C" {
+    pub fn fwrite_unlocked(
+        __ptr: *const ::std::os::raw::c_void,
+        __size: usize,
+        __n: usize,
+        __stream: *mut FILE,
+    ) -> usize;
+}
+extern "C" {
+    pub fn fseek(
+        __stream: *mut FILE,
+        __off: ::std::os::raw::c_long,
+        __whence: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ftell(__stream: *mut FILE) -> ::std::os::raw::c_long;
+}
+extern "C" {
+    pub fn rewind(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn fseeko(
+        __stream: *mut FILE,
+        __off: __off_t,
+        __whence: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ftello(__stream: *mut FILE) -> __off_t;
+}
+extern "C" {
+    pub fn fgetpos(__stream: *mut FILE, __pos: *mut fpos_t) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fsetpos(__stream: *mut FILE, __pos: *const fpos_t) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fseeko64(
+        __stream: *mut FILE,
+        __off: __off64_t,
+        __whence: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ftello64(__stream: *mut FILE) -> __off64_t;
+}
+extern "C" {
+    pub fn fgetpos64(__stream: *mut FILE, __pos: *mut fpos64_t) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fsetpos64(__stream: *mut FILE, __pos: *const fpos64_t) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn clearerr(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn feof(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ferror(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn clearerr_unlocked(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn feof_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ferror_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn perror(__s: *const ::std::os::raw::c_char);
+}
+extern "C" {
+    pub fn fileno(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn fileno_unlocked(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn pclose(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn popen(
+        __command: *const ::std::os::raw::c_char,
+        __modes: *const ::std::os::raw::c_char,
+    ) -> *mut FILE;
+}
+extern "C" {
+    pub fn ctermid(__s: *mut ::std::os::raw::c_char) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn cuserid(__s: *mut ::std::os::raw::c_char) -> *mut ::std::os::raw::c_char;
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct obstack {
+    _unused: [u8; 0],
+}
+extern "C" {
+    pub fn obstack_printf(
+        __obstack: *mut obstack,
+        __format: *const ::std::os::raw::c_char,
+        ...
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn obstack_vprintf(
+        __obstack: *mut obstack,
+        __format: *const ::std::os::raw::c_char,
+        __args: *mut __va_list_tag,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn flockfile(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn ftrylockfile(__stream: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn funlockfile(__stream: *mut FILE);
+}
+extern "C" {
+    pub fn __uflow(arg1: *mut FILE) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn __overflow(arg1: *mut FILE, arg2: ::std::os::raw::c_int) -> ::std::os::raw::c_int;
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_model {
+    _unused: [u8; 0],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_context {
+    _unused: [u8; 0],
+}
+pub type llama_pos = i32;
+pub type llama_token = i32;
+pub type llama_seq_id = i32;
+pub const llama_vocab_type_LLAMA_VOCAB_TYPE_SPM: llama_vocab_type = 0;
+pub const llama_vocab_type_LLAMA_VOCAB_TYPE_BPE: llama_vocab_type = 1;
+pub type llama_vocab_type = ::std::os::raw::c_uint;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_UNDEFINED: llama_token_type = 0;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_NORMAL: llama_token_type = 1;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_UNKNOWN: llama_token_type = 2;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_CONTROL: llama_token_type = 3;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_USER_DEFINED: llama_token_type = 4;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_UNUSED: llama_token_type = 5;
+pub const llama_token_type_LLAMA_TOKEN_TYPE_BYTE: llama_token_type = 6;
+pub type llama_token_type = ::std::os::raw::c_uint;
+pub const llama_ftype_LLAMA_FTYPE_ALL_F32: llama_ftype = 0;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_1: llama_ftype = 3;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: llama_ftype = 4;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q8_0: llama_ftype = 7;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_0: llama_ftype = 8;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_1: llama_ftype = 9;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q2_K: llama_ftype = 10;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_S: llama_ftype = 11;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_M: llama_ftype = 12;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_L: llama_ftype = 13;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_K_S: llama_ftype = 14;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_K_M: llama_ftype = 15;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_K_S: llama_ftype = 16;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_K_M: llama_ftype = 17;
+pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q6_K: llama_ftype = 18;
+pub const llama_ftype_LLAMA_FTYPE_GUESSED: llama_ftype = 1024;
+pub type llama_ftype = ::std::os::raw::c_uint;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_UNSPECIFIED: llama_rope_scaling_type = -1;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_NONE: llama_rope_scaling_type = 0;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_LINEAR: llama_rope_scaling_type = 1;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_YARN: llama_rope_scaling_type = 2;
+pub const llama_rope_scaling_type_LLAMA_ROPE_SCALING_MAX_VALUE: llama_rope_scaling_type = 2;
+pub type llama_rope_scaling_type = ::std::os::raw::c_int;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_token_data {
+    pub id: llama_token,
+    pub logit: f32,
+    pub p: f32,
+}
+#[test]
+fn bindgen_test_layout_llama_token_data() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_token_data> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_token_data>(),
+        12usize,
+        concat!("Size of: ", stringify!(llama_token_data))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_token_data>(),
+        4usize,
+        concat!("Alignment of ", stringify!(llama_token_data))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).id) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_token_data),
+            "::",
             stringify!(id)
         )
     );
@@ -3462,26 +5540,259 @@ pub type llama_progress_callback =
     ::std::option::Option<unsafe extern "C" fn(progress: f32, ctx: *mut ::std::os::raw::c_void)>;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
-pub struct llama_context_params {
-    pub seed: u32,
-    pub n_ctx: i32,
-    pub n_batch: i32,
-    pub n_gqa: i32,
-    pub rms_norm_eps: f32,
+pub struct llama_batch {
+    pub n_tokens: i32,
+    pub token: *mut llama_token,
+    pub embd: *mut f32,
+    pub pos: *mut llama_pos,
+    pub n_seq_id: *mut i32,
+    pub seq_id: *mut *mut llama_seq_id,
+    pub logits: *mut i8,
+    pub all_pos_0: llama_pos,
+    pub all_pos_1: llama_pos,
+    pub all_seq_id: llama_seq_id,
+}
+#[test]
+fn bindgen_test_layout_llama_batch() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_batch> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_batch>(),
+        72usize,
+        concat!("Size of: ", stringify!(llama_batch))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_batch>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_batch))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_tokens) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(n_tokens)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).token) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(token)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).embd) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(embd)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).pos) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(pos)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_seq_id) as usize - ptr as usize },
+        32usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(n_seq_id)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).seq_id) as usize - ptr as usize },
+        40usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(seq_id)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).logits) as usize - ptr as usize },
+        48usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(logits)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).all_pos_0) as usize - ptr as usize },
+        56usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(all_pos_0)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).all_pos_1) as usize - ptr as usize },
+        60usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(all_pos_1)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).all_seq_id) as usize - ptr as usize },
+        64usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_batch),
+            "::",
+            stringify!(all_seq_id)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_model_params {
     pub n_gpu_layers: i32,
     pub main_gpu: i32,
     pub tensor_split: *const f32,
-    pub rope_freq_base: f32,
-    pub rope_freq_scale: f32,
     pub progress_callback: llama_progress_callback,
     pub progress_callback_user_data: *mut ::std::os::raw::c_void,
-    pub low_vram: bool,
-    pub mul_mat_q: bool,
-    pub f16_kv: bool,
-    pub logits_all: bool,
     pub vocab_only: bool,
     pub use_mmap: bool,
     pub use_mlock: bool,
+}
+#[test]
+fn bindgen_test_layout_llama_model_params() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_model_params> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_model_params>(),
+        40usize,
+        concat!("Size of: ", stringify!(llama_model_params))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_model_params>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_model_params))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_gpu_layers) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(n_gpu_layers)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).main_gpu) as usize - ptr as usize },
+        4usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(main_gpu)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).tensor_split) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(tensor_split)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).progress_callback) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(progress_callback)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).progress_callback_user_data) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(progress_callback_user_data)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).vocab_only) as usize - ptr as usize },
+        32usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(vocab_only)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).use_mmap) as usize - ptr as usize },
+        33usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(use_mmap)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).use_mlock) as usize - ptr as usize },
+        34usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_params),
+            "::",
+            stringify!(use_mlock)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_context_params {
+    pub seed: u32,
+    pub n_ctx: u32,
+    pub n_batch: u32,
+    pub n_threads: u32,
+    pub n_threads_batch: u32,
+    pub rope_scaling_type: i8,
+    pub rope_freq_base: f32,
+    pub rope_freq_scale: f32,
+    pub yarn_ext_factor: f32,
+    pub yarn_attn_factor: f32,
+    pub yarn_beta_fast: f32,
+    pub yarn_beta_slow: f32,
+    pub yarn_orig_ctx: u32,
+    pub mul_mat_q: bool,
+    pub f16_kv: bool,
+    pub logits_all: bool,
     pub embedding: bool,
 }
 #[test]
@@ -3490,12 +5801,12 @@ fn bindgen_test_layout_llama_context_params() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<llama_context_params>(),
-        72usize,
+        56usize,
         concat!("Size of: ", stringify!(llama_context_params))
     );
     assert_eq!(
         ::std::mem::align_of::<llama_context_params>(),
-        8usize,
+        4usize,
         concat!("Alignment of ", stringify!(llama_context_params))
     );
     assert_eq!(
@@ -3529,58 +5840,38 @@ fn bindgen_test_layout_llama_context_params() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_gqa) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as usize - ptr as usize },
         12usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(n_gqa)
+            stringify!(n_threads)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).rms_norm_eps) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).n_threads_batch) as usize - ptr as usize },
         16usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(rms_norm_eps)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_gpu_layers) as usize - ptr as usize },
-        20usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(llama_context_params),
-            "::",
-            stringify!(n_gpu_layers)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).main_gpu) as usize - ptr as usize },
-        24usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(llama_context_params),
-            "::",
-            stringify!(main_gpu)
+            stringify!(n_threads_batch)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).tensor_split) as usize - ptr as usize },
-        32usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).rope_scaling_type) as usize - ptr as usize },
+        20usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(tensor_split)
+            stringify!(rope_scaling_type)
         )
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).rope_freq_base) as usize - ptr as usize },
-        40usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
@@ -3590,7 +5881,7 @@ fn bindgen_test_layout_llama_context_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).rope_freq_scale) as usize - ptr as usize },
-        44usize,
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
@@ -3599,98 +5890,88 @@ fn bindgen_test_layout_llama_context_params() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).progress_callback) as usize - ptr as usize },
-        48usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(llama_context_params),
-            "::",
-            stringify!(progress_callback)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).progress_callback_user_data) as usize - ptr as usize },
-        56usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_ext_factor) as usize - ptr as usize },
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(progress_callback_user_data)
+            stringify!(yarn_ext_factor)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).low_vram) as usize - ptr as usize },
-        64usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_attn_factor) as usize - ptr as usize },
+        36usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(low_vram)
+            stringify!(yarn_attn_factor)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).mul_mat_q) as usize - ptr as usize },
-        65usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_beta_fast) as usize - ptr as usize },
+        40usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(mul_mat_q)
+            stringify!(yarn_beta_fast)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).f16_kv) as usize - ptr as usize },
-        66usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_beta_slow) as usize - ptr as usize },
+        44usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(f16_kv)
+            stringify!(yarn_beta_slow)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).logits_all) as usize - ptr as usize },
-        67usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).yarn_orig_ctx) as usize - ptr as usize },
+        48usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(logits_all)
+            stringify!(yarn_orig_ctx)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).vocab_only) as usize - ptr as usize },
-        68usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).mul_mat_q) as usize - ptr as usize },
+        52usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(vocab_only)
+            stringify!(mul_mat_q)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).use_mmap) as usize - ptr as usize },
-        69usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).f16_kv) as usize - ptr as usize },
+        53usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(use_mmap)
+            stringify!(f16_kv)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).use_mlock) as usize - ptr as usize },
-        70usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).logits_all) as usize - ptr as usize },
+        54usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
             "::",
-            stringify!(use_mlock)
+            stringify!(logits_all)
         )
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).embedding) as usize - ptr as usize },
-        71usize,
+        55usize,
         concat!(
             "Offset of field: ",
             stringify!(llama_context_params),
@@ -3699,24 +5980,6 @@ fn bindgen_test_layout_llama_context_params() {
         )
     );
 }
-pub const llama_ftype_LLAMA_FTYPE_ALL_F32: llama_ftype = 0;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_1: llama_ftype = 3;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: llama_ftype = 4;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q8_0: llama_ftype = 7;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_0: llama_ftype = 8;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_1: llama_ftype = 9;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q2_K: llama_ftype = 10;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_S: llama_ftype = 11;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_M: llama_ftype = 12;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q3_K_L: llama_ftype = 13;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_K_S: llama_ftype = 14;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q4_K_M: llama_ftype = 15;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_K_S: llama_ftype = 16;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q5_K_M: llama_ftype = 17;
-pub const llama_ftype_LLAMA_FTYPE_MOSTLY_Q6_K: llama_ftype = 18;
-pub type llama_ftype = ::std::os::raw::c_uint;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct llama_model_quantize_params {
@@ -3724,6 +5987,8 @@ pub struct llama_model_quantize_params {
     pub ftype: llama_ftype,
     pub allow_requantize: bool,
     pub quantize_output_tensor: bool,
+    pub only_copy: bool,
+    pub pure_: bool,
 }
 #[test]
 fn bindgen_test_layout_llama_model_quantize_params() {
@@ -3780,6 +6045,26 @@ fn bindgen_test_layout_llama_model_quantize_params() {
             stringify!(quantize_output_tensor)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).only_copy) as usize - ptr as usize },
+        10usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_quantize_params),
+            "::",
+            stringify!(only_copy)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).pure_) as usize - ptr as usize },
+        11usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_model_quantize_params),
+            "::",
+            stringify!(pure_)
+        )
+    );
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
@@ -3955,7 +6240,7 @@ fn bindgen_test_layout_llama_timings() {
     );
 }
 extern "C" {
-    pub fn llama_max_devices() -> ::std::os::raw::c_int;
+    pub fn llama_model_default_params() -> llama_model_params;
 }
 extern "C" {
     pub fn llama_context_default_params() -> llama_context_params;
@@ -3963,6 +6248,36 @@ extern "C" {
 extern "C" {
     pub fn llama_model_quantize_default_params() -> llama_model_quantize_params;
 }
+extern "C" {
+    pub fn llama_backend_init(numa: bool);
+}
+extern "C" {
+    pub fn llama_backend_free();
+}
+extern "C" {
+    pub fn llama_load_model_from_file(
+        path_model: *const ::std::os::raw::c_char,
+        params: llama_model_params,
+    ) -> *mut llama_model;
+}
+extern "C" {
+    pub fn llama_free_model(model: *mut llama_model);
+}
+extern "C" {
+    pub fn llama_new_context_with_model(
+        model: *mut llama_model,
+        params: llama_context_params,
+    ) -> *mut llama_context;
+}
+extern "C" {
+    pub fn llama_free(ctx: *mut llama_context);
+}
+extern "C" {
+    pub fn llama_time_us() -> i64;
+}
+extern "C" {
+    pub fn llama_max_devices() -> ::std::os::raw::c_int;
+}
 extern "C" {
     pub fn llama_mmap_supported() -> bool;
 }
@@ -3970,66 +6285,284 @@ extern "C" {
     pub fn llama_mlock_supported() -> bool;
 }
 extern "C" {
-    pub fn llama_backend_init(numa: bool);
+    pub fn llama_get_model(ctx: *const llama_context) -> *const llama_model;
 }
 extern "C" {
-    pub fn llama_backend_free();
+    pub fn llama_n_ctx(ctx: *const llama_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_time_us() -> i64;
+    pub fn llama_vocab_type(model: *const llama_model) -> llama_vocab_type;
+}
+extern "C" {
+    pub fn llama_n_vocab(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_n_ctx_train(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_n_embd(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_rope_freq_scale_train(model: *const llama_model) -> f32;
+}
+extern "C" {
+    pub fn llama_model_meta_val_str(
+        model: *const llama_model,
+        key: *const ::std::os::raw::c_char,
+        buf: *mut ::std::os::raw::c_char,
+        buf_size: usize,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_meta_count(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_meta_key_by_index(
+        model: *const llama_model,
+        i: ::std::os::raw::c_int,
+        buf: *mut ::std::os::raw::c_char,
+        buf_size: usize,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_meta_val_str_by_index(
+        model: *const llama_model,
+        i: ::std::os::raw::c_int,
+        buf: *mut ::std::os::raw::c_char,
+        buf_size: usize,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_desc(
+        model: *const llama_model,
+        buf: *mut ::std::os::raw::c_char,
+        buf_size: usize,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_size(model: *const llama_model) -> u64;
+}
+extern "C" {
+    pub fn llama_model_n_params(model: *const llama_model) -> u64;
+}
+extern "C" {
+    pub fn llama_get_model_tensor(
+        model: *mut llama_model,
+        name: *const ::std::os::raw::c_char,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn llama_model_quantize(
+        fname_inp: *const ::std::os::raw::c_char,
+        fname_out: *const ::std::os::raw::c_char,
+        params: *const llama_model_quantize_params,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_apply_lora_from_file(
+        ctx: *mut llama_context,
+        path_lora: *const ::std::os::raw::c_char,
+        scale: f32,
+        path_base_model: *const ::std::os::raw::c_char,
+        n_threads: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_model_apply_lora_from_file(
+        model: *const llama_model,
+        path_lora: *const ::std::os::raw::c_char,
+        scale: f32,
+        path_base_model: *const ::std::os::raw::c_char,
+        n_threads: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_kv_cache_view_cell {
+    pub pos: llama_pos,
+}
+#[test]
+fn bindgen_test_layout_llama_kv_cache_view_cell() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_kv_cache_view_cell> =
+        ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_kv_cache_view_cell>(),
+        4usize,
+        concat!("Size of: ", stringify!(llama_kv_cache_view_cell))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_kv_cache_view_cell>(),
+        4usize,
+        concat!("Alignment of ", stringify!(llama_kv_cache_view_cell))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).pos) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view_cell),
+            "::",
+            stringify!(pos)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_kv_cache_view {
+    pub n_cells: i32,
+    pub n_max_seq: i32,
+    pub token_count: i32,
+    pub used_cells: i32,
+    pub max_contiguous: i32,
+    pub max_contiguous_idx: i32,
+    pub cells: *mut llama_kv_cache_view_cell,
+    pub cells_sequences: *mut llama_seq_id,
+}
+#[test]
+fn bindgen_test_layout_llama_kv_cache_view() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_kv_cache_view> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_kv_cache_view>(),
+        40usize,
+        concat!("Size of: ", stringify!(llama_kv_cache_view))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_kv_cache_view>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_kv_cache_view))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_cells) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(n_cells)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_max_seq) as usize - ptr as usize },
+        4usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(n_max_seq)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).token_count) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(token_count)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).used_cells) as usize - ptr as usize },
+        12usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(used_cells)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).max_contiguous) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(max_contiguous)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).max_contiguous_idx) as usize - ptr as usize },
+        20usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(max_contiguous_idx)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).cells) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(cells)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).cells_sequences) as usize - ptr as usize },
+        32usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_kv_cache_view),
+            "::",
+            stringify!(cells_sequences)
+        )
+    );
 }
 extern "C" {
-    pub fn llama_load_model_from_file(
-        path_model: *const ::std::os::raw::c_char,
-        params: llama_context_params,
-    ) -> *mut llama_model;
+    pub fn llama_kv_cache_view_init(
+        ctx: *const llama_context,
+        n_max_seq: i32,
+    ) -> llama_kv_cache_view;
 }
 extern "C" {
-    pub fn llama_free_model(model: *mut llama_model);
+    pub fn llama_kv_cache_view_free(view: *mut llama_kv_cache_view);
 }
 extern "C" {
-    pub fn llama_new_context_with_model(
-        model: *mut llama_model,
-        params: llama_context_params,
-    ) -> *mut llama_context;
+    pub fn llama_kv_cache_view_update(ctx: *const llama_context, view: *mut llama_kv_cache_view);
 }
 extern "C" {
-    pub fn llama_init_from_file(
-        path_model: *const ::std::os::raw::c_char,
-        params: llama_context_params,
-    ) -> *mut llama_context;
+    pub fn llama_get_kv_cache_token_count(ctx: *const llama_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_free(ctx: *mut llama_context);
+    pub fn llama_get_kv_cache_used_cells(ctx: *const llama_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_model_quantize(
-        fname_inp: *const ::std::os::raw::c_char,
-        fname_out: *const ::std::os::raw::c_char,
-        params: *const llama_model_quantize_params,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_kv_cache_clear(ctx: *mut llama_context);
 }
 extern "C" {
-    pub fn llama_apply_lora_from_file(
+    pub fn llama_kv_cache_seq_rm(
         ctx: *mut llama_context,
-        path_lora: *const ::std::os::raw::c_char,
-        path_base_model: *const ::std::os::raw::c_char,
-        n_threads: ::std::os::raw::c_int,
-    ) -> ::std::os::raw::c_int;
+        seq_id: llama_seq_id,
+        p0: llama_pos,
+        p1: llama_pos,
+    );
 }
 extern "C" {
-    pub fn llama_model_apply_lora_from_file(
-        model: *const llama_model,
-        path_lora: *const ::std::os::raw::c_char,
-        path_base_model: *const ::std::os::raw::c_char,
-        n_threads: ::std::os::raw::c_int,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_kv_cache_seq_cp(
+        ctx: *mut llama_context,
+        seq_id_src: llama_seq_id,
+        seq_id_dst: llama_seq_id,
+        p0: llama_pos,
+        p1: llama_pos,
+    );
 }
 extern "C" {
-    pub fn llama_get_kv_cache_token_count(ctx: *const llama_context) -> ::std::os::raw::c_int;
+    pub fn llama_kv_cache_seq_keep(ctx: *mut llama_context, seq_id: llama_seq_id);
 }
 extern "C" {
-    pub fn llama_set_rng_seed(ctx: *mut llama_context, seed: u32);
+    pub fn llama_kv_cache_seq_shift(
+        ctx: *mut llama_context,
+        seq_id: llama_seq_id,
+        p0: llama_pos,
+        p1: llama_pos,
+        delta: llama_pos,
+    );
 }
 extern "C" {
     pub fn llama_get_state_size(ctx: *const llama_context) -> usize;
@@ -4060,105 +6593,106 @@ extern "C" {
 extern "C" {
     pub fn llama_eval(
         ctx: *mut llama_context,
-        tokens: *const llama_token,
-        n_tokens: ::std::os::raw::c_int,
+        tokens: *mut llama_token,
+        n_tokens: i32,
         n_past: ::std::os::raw::c_int,
-        n_threads: ::std::os::raw::c_int,
     ) -> ::std::os::raw::c_int;
 }
 extern "C" {
     pub fn llama_eval_embd(
         ctx: *mut llama_context,
-        embd: *const f32,
-        n_tokens: ::std::os::raw::c_int,
+        embd: *mut f32,
+        n_tokens: i32,
         n_past: ::std::os::raw::c_int,
-        n_threads: ::std::os::raw::c_int,
     ) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_eval_export(
-        ctx: *mut llama_context,
-        fname: *const ::std::os::raw::c_char,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_batch_get_one(
+        tokens: *mut llama_token,
+        n_tokens: i32,
+        pos_0: llama_pos,
+        seq_id: llama_seq_id,
+    ) -> llama_batch;
 }
 extern "C" {
-    pub fn llama_tokenize(
-        ctx: *mut llama_context,
-        text: *const ::std::os::raw::c_char,
-        tokens: *mut llama_token,
-        n_max_tokens: ::std::os::raw::c_int,
-        add_bos: bool,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_batch_init(n_tokens: i32, embd: i32, n_seq_max: i32) -> llama_batch;
 }
 extern "C" {
-    pub fn llama_tokenize_with_model(
-        model: *const llama_model,
-        text: *const ::std::os::raw::c_char,
-        tokens: *mut llama_token,
-        n_max_tokens: ::std::os::raw::c_int,
-        add_bos: bool,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_batch_free(batch: llama_batch);
 }
 extern "C" {
-    pub fn llama_n_vocab(ctx: *const llama_context) -> ::std::os::raw::c_int;
+    pub fn llama_decode(ctx: *mut llama_context, batch: llama_batch) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_n_ctx(ctx: *const llama_context) -> ::std::os::raw::c_int;
+    pub fn llama_set_n_threads(ctx: *mut llama_context, n_threads: u32, n_threads_batch: u32);
 }
 extern "C" {
-    pub fn llama_n_embd(ctx: *const llama_context) -> ::std::os::raw::c_int;
+    pub fn llama_get_logits(ctx: *mut llama_context) -> *mut f32;
 }
 extern "C" {
-    pub fn llama_n_vocab_from_model(model: *const llama_model) -> ::std::os::raw::c_int;
+    pub fn llama_get_logits_ith(ctx: *mut llama_context, i: i32) -> *mut f32;
 }
 extern "C" {
-    pub fn llama_n_ctx_from_model(model: *const llama_model) -> ::std::os::raw::c_int;
+    pub fn llama_get_embeddings(ctx: *mut llama_context) -> *mut f32;
 }
 extern "C" {
-    pub fn llama_n_embd_from_model(model: *const llama_model) -> ::std::os::raw::c_int;
+    pub fn llama_token_get_text(
+        model: *const llama_model,
+        token: llama_token,
+    ) -> *const ::std::os::raw::c_char;
 }
 extern "C" {
-    pub fn llama_get_vocab(
-        ctx: *const llama_context,
-        strings: *mut *const ::std::os::raw::c_char,
-        scores: *mut f32,
-        capacity: ::std::os::raw::c_int,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_token_get_score(model: *const llama_model, token: llama_token) -> f32;
 }
 extern "C" {
-    pub fn llama_get_vocab_from_model(
-        model: *const llama_model,
-        strings: *mut *const ::std::os::raw::c_char,
-        scores: *mut f32,
-        capacity: ::std::os::raw::c_int,
-    ) -> ::std::os::raw::c_int;
+    pub fn llama_token_get_type(model: *const llama_model, token: llama_token) -> llama_token_type;
 }
 extern "C" {
-    pub fn llama_get_logits(ctx: *mut llama_context) -> *mut f32;
+    pub fn llama_token_bos(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_get_embeddings(ctx: *mut llama_context) -> *mut f32;
+    pub fn llama_token_eos(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_token_to_str(
-        ctx: *const llama_context,
-        token: llama_token,
-    ) -> *const ::std::os::raw::c_char;
+    pub fn llama_token_nl(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_token_to_str_with_model(
-        model: *const llama_model,
-        token: llama_token,
-    ) -> *const ::std::os::raw::c_char;
+    pub fn llama_add_bos_token(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_add_eos_token(model: *const llama_model) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn llama_token_prefix(model: *const llama_model) -> llama_token;
+}
+extern "C" {
+    pub fn llama_token_middle(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_token_bos() -> llama_token;
+    pub fn llama_token_suffix(model: *const llama_model) -> llama_token;
 }
 extern "C" {
-    pub fn llama_token_eos() -> llama_token;
+    pub fn llama_token_eot(model: *const llama_model) -> llama_token;
+}
+extern "C" {
+    #[doc = " @details Convert the provided text into tokens.\n @param tokens The tokens pointer must be large enough to hold the resulting tokens.\n @return Returns the number of tokens on success, no more than n_max_tokens\n @return Returns a negative number on failure - the number of tokens that would have been returned\n @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.\n                Does not insert a leading space."]
+    pub fn llama_tokenize(
+        model: *const llama_model,
+        text: *const ::std::os::raw::c_char,
+        text_len: ::std::os::raw::c_int,
+        tokens: *mut llama_token,
+        n_max_tokens: ::std::os::raw::c_int,
+        add_bos: bool,
+        special: bool,
+    ) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn llama_token_nl() -> llama_token;
+    pub fn llama_token_to_piece(
+        model: *const llama_model,
+        token: llama_token,
+        buf: *mut ::std::os::raw::c_char,
+        length: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
 }
 extern "C" {
     pub fn llama_grammar_init(
@@ -4171,24 +6705,21 @@ extern "C" {
     pub fn llama_grammar_free(grammar: *mut llama_grammar);
 }
 extern "C" {
-    #[doc = " @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix."]
-    pub fn llama_sample_repetition_penalty(
-        ctx: *mut llama_context,
-        candidates: *mut llama_token_data_array,
-        last_tokens: *const llama_token,
-        last_tokens_size: usize,
-        penalty: f32,
-    );
+    pub fn llama_grammar_copy(grammar: *const llama_grammar) -> *mut llama_grammar;
 }
 extern "C" {
-    #[doc = " @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details."]
-    pub fn llama_sample_frequency_and_presence_penalties(
+    pub fn llama_set_rng_seed(ctx: *mut llama_context, seed: u32);
+}
+extern "C" {
+    #[doc = " @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.\n @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details."]
+    pub fn llama_sample_repetition_penalties(
         ctx: *mut llama_context,
         candidates: *mut llama_token_data_array,
         last_tokens: *const llama_token,
-        last_tokens_size: usize,
-        alpha_frequency: f32,
-        alpha_presence: f32,
+        penalty_last_n: usize,
+        penalty_repeat: f32,
+        penalty_freq: f32,
+        penalty_present: f32,
     );
 }
 extern "C" {
@@ -4222,6 +6753,15 @@ extern "C" {
         min_keep: usize,
     );
 }
+extern "C" {
+    #[doc = " @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841"]
+    pub fn llama_sample_min_p(
+        ctx: *mut llama_context,
+        candidates: *mut llama_token_data_array,
+        p: f32,
+        min_keep: usize,
+    );
+}
 extern "C" {
     #[doc = " @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/."]
     pub fn llama_sample_tail_free(
@@ -4240,6 +6780,13 @@ extern "C" {
         min_keep: usize,
     );
 }
+extern "C" {
+    pub fn llama_sample_temp(
+        ctx: *mut llama_context,
+        candidates: *mut llama_token_data_array,
+        temp: f32,
+    );
+}
 extern "C" {
     pub fn llama_sample_temperature(
         ctx: *mut llama_context,
@@ -4277,7 +6824,7 @@ extern "C" {
     ) -> llama_token;
 }
 extern "C" {
-    #[doc = " @details Selects the token with the highest probability."]
+    #[doc = " @details Selects the token with the highest probability.\n          Does not compute the token probabilities. Use llama_sample_softmax() instead."]
     pub fn llama_sample_token_greedy(
         ctx: *mut llama_context,
         candidates: *mut llama_token_data_array,
@@ -4298,6 +6845,146 @@ extern "C" {
         token: llama_token,
     );
 }
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_beam_view {
+    pub tokens: *const llama_token,
+    pub n_tokens: usize,
+    pub p: f32,
+    pub eob: bool,
+}
+#[test]
+fn bindgen_test_layout_llama_beam_view() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_beam_view> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_beam_view>(),
+        24usize,
+        concat!("Size of: ", stringify!(llama_beam_view))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_beam_view>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_beam_view))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).tokens) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beam_view),
+            "::",
+            stringify!(tokens)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_tokens) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beam_view),
+            "::",
+            stringify!(n_tokens)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).p) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beam_view),
+            "::",
+            stringify!(p)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).eob) as usize - ptr as usize },
+        20usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beam_view),
+            "::",
+            stringify!(eob)
+        )
+    );
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct llama_beams_state {
+    pub beam_views: *mut llama_beam_view,
+    pub n_beams: usize,
+    pub common_prefix_length: usize,
+    pub last_call: bool,
+}
+#[test]
+fn bindgen_test_layout_llama_beams_state() {
+    const UNINIT: ::std::mem::MaybeUninit<llama_beams_state> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<llama_beams_state>(),
+        32usize,
+        concat!("Size of: ", stringify!(llama_beams_state))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<llama_beams_state>(),
+        8usize,
+        concat!("Alignment of ", stringify!(llama_beams_state))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).beam_views) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beams_state),
+            "::",
+            stringify!(beam_views)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).n_beams) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beams_state),
+            "::",
+            stringify!(n_beams)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).common_prefix_length) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beams_state),
+            "::",
+            stringify!(common_prefix_length)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).last_call) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(llama_beams_state),
+            "::",
+            stringify!(last_call)
+        )
+    );
+}
+pub type llama_beam_search_callback_fn_t = ::std::option::Option<
+    unsafe extern "C" fn(callback_data: *mut ::std::os::raw::c_void, arg1: llama_beams_state),
+>;
+extern "C" {
+    #[doc = " @details Deterministically returns entire sentence constructed by a beam search.\n @param ctx Pointer to the llama_context.\n @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.\n @param callback_data A pointer that is simply passed back to callback.\n @param n_beams Number of beams to use.\n @param n_past Number of tokens already evaluated.\n @param n_predict Maximum number of tokens to predict. EOS may occur earlier."]
+    pub fn llama_beam_search(
+        ctx: *mut llama_context,
+        callback: llama_beam_search_callback_fn_t,
+        callback_data: *mut ::std::os::raw::c_void,
+        n_beams: usize,
+        n_past: ::std::os::raw::c_int,
+        n_predict: ::std::os::raw::c_int,
+    );
+}
 extern "C" {
     pub fn llama_get_timings(ctx: *mut llama_context) -> llama_timings;
 }
@@ -4310,3 +6997,78 @@ extern "C" {
 extern "C" {
     pub fn llama_print_system_info() -> *const ::std::os::raw::c_char;
 }
+extern "C" {
+    pub fn llama_log_set(log_callback: ggml_log_callback, user_data: *mut ::std::os::raw::c_void);
+}
+extern "C" {
+    pub fn llama_dump_timing_info_yaml(stream: *mut FILE, ctx: *const llama_context);
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct ggml_backend_buffer {
+    pub _address: u8,
+}
+pub type __builtin_va_list = [__va_list_tag; 1usize];
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct __va_list_tag {
+    pub gp_offset: ::std::os::raw::c_uint,
+    pub fp_offset: ::std::os::raw::c_uint,
+    pub overflow_arg_area: *mut ::std::os::raw::c_void,
+    pub reg_save_area: *mut ::std::os::raw::c_void,
+}
+#[test]
+fn bindgen_test_layout___va_list_tag() {
+    const UNINIT: ::std::mem::MaybeUninit<__va_list_tag> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<__va_list_tag>(),
+        24usize,
+        concat!("Size of: ", stringify!(__va_list_tag))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<__va_list_tag>(),
+        8usize,
+        concat!("Alignment of ", stringify!(__va_list_tag))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).gp_offset) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__va_list_tag),
+            "::",
+            stringify!(gp_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).fp_offset) as usize - ptr as usize },
+        4usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__va_list_tag),
+            "::",
+            stringify!(fp_offset)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).overflow_arg_area) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__va_list_tag),
+            "::",
+            stringify!(overflow_arg_area)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).reg_save_area) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(__va_list_tag),
+            "::",
+            stringify!(reg_save_area)
+        )
+    );
+}
diff --git a/crates/llm-chain-llama/examples/alpaca.rs b/crates/llm-chain-llama/examples/alpaca.rs
index 6e581976..3b92d0f6 100644
--- a/crates/llm-chain-llama/examples/alpaca.rs
+++ b/crates/llm-chain-llama/examples/alpaca.rs
@@ -1,7 +1,7 @@
 use llm_chain::executor;
 use llm_chain::{parameters, prompt};
 
-#[tokio::main(flavor = "current_thread")]
+#[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let exec = executor!(llama)?;
     let res = prompt!("Write a hypothetical weather report for {season} in {location}.")
diff --git a/crates/llm-chain-llama/examples/few_shot.rs b/crates/llm-chain-llama/examples/few_shot.rs
index e3f3e389..dcc09d22 100644
--- a/crates/llm-chain-llama/examples/few_shot.rs
+++ b/crates/llm-chain-llama/examples/few_shot.rs
@@ -1,4 +1,3 @@
-use llm_chain::options;
 use llm_chain::prompt::Conversation;
 use llm_chain::{chains::conversation::Chain, executor, parameters, prompt, step::Step};
 /// This example demonstrates how to use the llm-chain for few-shot prompting
@@ -10,12 +9,7 @@ use llm_chain::{chains::conversation::Chain, executor, parameters, prompt, step:
 /// Make sure to have the env var 'LLM_CHAIN_MODEL' set
 #[tokio::main(flavor = "multi_thread", worker_threads = 1)]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let opts = options!(
-        NThreads: 4_usize,
-        StopSequence: vec!["\n".to_string()]
-    );
-
-    let exec_1 = executor!(llama, opts.clone())?;
+    let exec_1 = executor!(llama)?;
 
     let user_prompt =
         "Take the last letters of the words in '{{ full_name }}' and concatenate them";
@@ -47,7 +41,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // Define the step
     let step = Step::for_prompt_template(prompt!(user: user_prompt));
     // Execute the chain.
-    let exec_2 = executor!(llama, opts)?;
+    let exec_2 = executor!(llama)?;
     let res = chain
         .send_message(step, &parameters!().with("full_name", "Elon Musk"), &exec_2)
         .await?;
diff --git a/crates/llm-chain-llama/examples/map_reduce_llama.rs b/crates/llm-chain-llama/examples/map_reduce_llama.rs
index 884304da..87102cff 100644
--- a/crates/llm-chain-llama/examples/map_reduce_llama.rs
+++ b/crates/llm-chain-llama/examples/map_reduce_llama.rs
@@ -1,14 +1,35 @@
 use llm_chain::chains::map_reduce::Chain;
 use llm_chain::executor;
+use llm_chain::options;
 use llm_chain::{prompt, step::Step, Parameters};
 
-#[tokio::main(flavor = "current_thread")]
+#[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let exec = executor!(llama)?;
+    let opts = options!(
+        ModelType: "llama",
+        MaxContextSize: 4096_usize,
+        NThreads: 4_usize,
+        MaxTokens: 2048_usize,
+        MaxBatchSize: 4096_usize,
+        TopK: 40_i32,
+        TopP: 0.95,
+        TfsZ: 1.0,
+        TypicalP: 1.0,
+        Temperature: 0.8,
+        RepeatPenalty: 1.1,
+        RepeatPenaltyLastN: 64_usize,
+        FrequencyPenalty: 0.0,
+        PresencePenalty: 0.0,
+        Mirostat: 0_i32,
+        MirostatTau: 5.0,
+        MirostatEta: 0.1,
+        PenalizeNl: true,
+        StopSequence: vec!["\n\n".to_string()]
+    );
+    let exec = executor!(llama, opts.clone())?;
     let map_prompt = Step::for_prompt_template(prompt!("== ARTICLE ==\n{{text}}== SUMMARY ==\n"));
     let reduce_prompt =
         Step::for_prompt_template(prompt!("== ARTICLE ==\n{{text}}== FINAL SUMMARY ==\n"));
-
     let chain = Chain::new(map_prompt, reduce_prompt);
     let article = include_str!("article_to_summarize.md");
     let docs = vec![Parameters::new_with_text(article)];
diff --git a/crates/llm-chain-llama/examples/simple_llama.rs b/crates/llm-chain-llama/examples/simple_llama.rs
index 079c0acb..28cdd6cf 100644
--- a/crates/llm-chain-llama/examples/simple_llama.rs
+++ b/crates/llm-chain-llama/examples/simple_llama.rs
@@ -30,7 +30,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         ModelType: "llama",
         MaxContextSize: 512_usize,
         NThreads: 4_usize,
-        MaxTokens: 0_usize,
+        MaxTokens: 512_usize,
         TopK: 40_i32,
         TopP: 0.95,
         TfsZ: 1.0,
diff --git a/crates/llm-chain-llama/examples/stream.rs b/crates/llm-chain-llama/examples/stream.rs
index 0274e730..906f16b3 100644
--- a/crates/llm-chain-llama/examples/stream.rs
+++ b/crates/llm-chain-llama/examples/stream.rs
@@ -4,10 +4,9 @@ use llm_chain::{executor, parameters, prompt};
 /// This example demonstrates how to use the llm-chain-llama crate to generate streaming text using a
 /// LLaMA model.
 ///
-/// Usage: cargo run --example simple path/to/llama-or-alpaca-model
+/// Usage: cargo run --example stream
 ///
-/// For example, if the model is located at "/models/llama"
-/// cargo run --example simple /models/llama
+/// Make sure to have the env var 'LLM_CHAIN_MODEL' set.
 #[tokio::main(flavor = "current_thread")]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let exec = executor!(llama)?;
diff --git a/crates/llm-chain-llama/src/batch.rs b/crates/llm-chain-llama/src/batch.rs
new file mode 100644
index 00000000..17af1c73
--- /dev/null
+++ b/crates/llm-chain-llama/src/batch.rs
@@ -0,0 +1,165 @@
+use llm_chain_llama_sys::{llama_batch, llama_seq_id};
+use std::ptr::null_mut;
+
+/// Owned, Rust-side description of a batch of tokens for `llama_decode`.
+///
+/// Every array exposed through the FFI `llama_batch` view is owned by this
+/// struct and released by Rust when it is dropped. The view must therefore
+/// never be handed to `llama_batch_free`, which is meant for batches that
+/// llama.cpp allocated itself.
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct LlamaBatch {
+    n_tokens: i32,
+    token: Vec<i32>,
+    embd: Vec<f32>,
+    pos: Vec<i32>,
+    n_seq_id: Vec<i32>,
+    seq_id: Vec<Vec<i32>>,
+    // Row pointers into `seq_id`, laid out as the `llama_seq_id **` the C API
+    // expects. Rows are never resized after construction, so these pointers
+    // stay valid for as long as `seq_id` is alive, even if the struct moves.
+    seq_id_ptrs: Vec<*mut llama_seq_id>,
+    logits: Vec<bool>,
+    all_pos_0: i32,
+    all_pos_1: i32,
+    all_seq_id: i32,
+}
+
+// SAFETY: the raw pointers in `seq_id_ptrs` only point into heap buffers owned
+// by the same value's `seq_id` field, so moving or sharing a `LlamaBatch`
+// across threads is no different from doing so with the vectors themselves.
+unsafe impl Send for LlamaBatch {}
+unsafe impl Sync for LlamaBatch {}
+
+// Collects one pointer per row of `rows`, each to the row's first element.
+fn row_ptrs(rows: &mut [Vec<i32>]) -> Vec<*mut llama_seq_id> {
+    rows.iter_mut().map(|row| row.as_mut_ptr()).collect()
+}
+
+impl Clone for LlamaBatch {
+    // Hand-written so that the copy gets row pointers into its own `seq_id`
+    // buffers rather than pointers into the original's.
+    fn clone(&self) -> Self {
+        let mut seq_id = self.seq_id.clone();
+        let seq_id_ptrs = row_ptrs(&mut seq_id);
+        Self {
+            n_tokens: self.n_tokens,
+            token: self.token.clone(),
+            embd: self.embd.clone(),
+            pos: self.pos.clone(),
+            n_seq_id: self.n_seq_id.clone(),
+            seq_id,
+            seq_id_ptrs,
+            logits: self.logits.clone(),
+            all_pos_0: self.all_pos_0,
+            all_pos_1: self.all_pos_1,
+            all_seq_id: self.all_seq_id,
+        }
+    }
+}
+
+impl LlamaBatch {
+    /// Creates a batch holding `tokens` at positions `0..tokens.len()`.
+    ///
+    /// Every token is assigned `max_seq` sequence ids, all of them `0`, and
+    /// logits are disabled for every position (see [`Self::enable_logits`]).
+    pub fn new_with_tokens(tokens: Vec<i32>, max_seq: i32) -> Self {
+        let n_tokens = tokens.len();
+        // A negative count is meaningless; treat it as "no sequence ids".
+        let ids_per_token = max_seq.max(0);
+        // Each row holds exactly as many ids as `n_seq_id` advertises for it,
+        // so a reader honouring `n_seq_id` never runs past the end of a row.
+        let mut seq_id = vec![vec![0; ids_per_token as usize]; n_tokens];
+        let seq_id_ptrs = row_ptrs(&mut seq_id);
+
+        Self {
+            n_tokens: n_tokens as i32,
+            pos: (0..n_tokens as i32).collect(),
+            token: tokens,
+            embd: vec![],
+            n_seq_id: vec![ids_per_token; n_tokens],
+            seq_id,
+            seq_id_ptrs,
+            logits: vec![false; n_tokens],
+            all_pos_0: 0,
+            all_pos_1: 0,
+            all_seq_id: 0,
+        }
+    }
+
+    /// Creates a single-token batch for `token` at position `pos`, in
+    /// sequence `0`, with logits enabled.
+    pub fn new_with_token(token: i32, pos: i32) -> Self {
+        let mut seq_id = vec![vec![0]];
+        let seq_id_ptrs = row_ptrs(&mut seq_id);
+
+        Self {
+            n_tokens: 1,
+            token: vec![token],
+            embd: vec![],
+            pos: vec![pos],
+            n_seq_id: vec![1],
+            seq_id,
+            seq_id_ptrs,
+            logits: vec![true],
+            all_pos_0: 0,
+            all_pos_1: 0,
+            all_seq_id: 0,
+        }
+    }
+
+    /// Number of tokens in the batch.
+    pub fn token_count(&self) -> usize {
+        self.n_tokens as usize
+    }
+
+    /// Requests logits for the token at index `pos`.
+    ///
+    /// Panics if `pos` is not a valid index into the batch.
+    pub fn enable_logits(&mut self, pos: usize) {
+        self.logits[pos] = true;
+    }
+}
+
+// Builds the FFI view of `batch`. Nothing is copied or allocated: every
+// pointer refers to memory owned by `batch`, so the result must only be used
+// while `batch` is alive and unmodified. The `*mut` casts exist because the
+// C struct declares mutable pointers; this assumes llama.cpp only reads
+// through them (NOTE(review): confirm against llama.h).
+fn convert_llama_batch(batch: &LlamaBatch) -> llama_batch {
+    // An empty `embd` is passed as a null pointer rather than a dangling one.
+    let embd = if batch.embd.is_empty() {
+        null_mut()
+    } else {
+        batch.embd.as_ptr() as *mut f32
+    };
+    llama_batch {
+        n_tokens: batch.n_tokens,
+        token: batch.token.as_ptr() as *mut i32,
+        embd,
+        pos: batch.pos.as_ptr() as *mut i32,
+        n_seq_id: batch.n_seq_id.as_ptr() as *mut i32,
+        seq_id: batch.seq_id_ptrs.as_ptr() as *mut *mut llama_seq_id,
+        logits: batch.logits.as_ptr() as *mut i8,
+        all_pos_0: batch.all_pos_0,
+        all_pos_1: batch.all_pos_1,
+        all_seq_id: batch.all_seq_id,
+    }
+}
+
+/// Borrowing conversion: the returned `llama_batch` points into `batch` and
+/// must not outlive it.
+impl From<&LlamaBatch> for llama_batch {
+    fn from(batch: &LlamaBatch) -> Self {
+        convert_llama_batch(batch)
+    }
+}
+
+/// Borrowing conversion: the returned `llama_batch` points into `batch` and
+/// must not outlive it.
+impl From<&mut LlamaBatch> for llama_batch {
+    fn from(batch: &mut LlamaBatch) -> Self {
+        convert_llama_batch(batch)
+    }
+}
diff --git a/crates/llm-chain-llama/src/context.rs b/crates/llm-chain-llama/src/context.rs
index cbae028f..a84894ce 100644
--- a/crates/llm-chain-llama/src/context.rs
+++ b/crates/llm-chain-llama/src/context.rs
@@ -1,20 +1,22 @@
-use std::{
-    ffi::{CStr, CString},
-    ptr::null_mut,
-};
+use std::ffi::{CStr, CString};
 
+use crate::batch;
+use crate::model::ModelParams;
 use crate::options::LlamaInvocation;
 use anyhow::Result;
 use llm_chain_llama_sys::{
-    llama_context, llama_context_default_params, llama_context_params, llama_eval, llama_free,
-    llama_get_logits, llama_init_from_file, llama_n_vocab,
-    llama_sample_frequency_and_presence_penalties, llama_sample_repetition_penalty,
+    llama_context, llama_context_default_params, llama_context_params, llama_decode, llama_eval,
+    llama_free, llama_get_logits, llama_get_logits_ith, llama_load_model_from_file, llama_model,
+    llama_n_vocab, llama_new_context_with_model, llama_sample_repetition_penalties,
     llama_sample_tail_free, llama_sample_temperature, llama_sample_token,
     llama_sample_token_greedy, llama_sample_token_mirostat, llama_sample_token_mirostat_v2,
     llama_sample_top_k, llama_sample_top_p, llama_sample_typical, llama_token_data,
-    llama_token_data_array, llama_token_nl, llama_token_to_str,
+    llama_token_data_array, llama_token_eos, llama_token_get_text, llama_token_nl,
+    llama_token_to_piece,
 };
 
+pub use batch::LlamaBatch;
+
 #[derive(Debug, thiserror::Error)]
 #[error("LLAMA.cpp returned error-code {0}")]
 pub struct LLAMACPPErrorCode(i32);
@@ -22,13 +24,22 @@ pub struct LLAMACPPErrorCode(i32);
 // Represents the configuration parameters for a LLamaContext.
 #[derive(Debug, Clone)]
 pub struct ContextParams {
-    pub n_parts: i32,
-    pub n_ctx: i32,
-    pub seed: i32,
+    pub seed: u32,
+    pub n_ctx: u32,
+    pub n_batch: u32,
+    pub n_threads: u32,
+    pub n_threads_batch: u32,
+    pub rope_scaling_type: i8,
+    pub rope_freq_base: f32,
+    pub rope_freq_scale: f32,
+    pub yarn_ext_factor: f32,
+    pub yarn_attn_factor: f32,
+    pub yarn_beta_fast: f32,
+    pub yarn_beta_slow: f32,
+    pub yarn_orig_ctx: u32,
+    pub mul_mat_q: bool,
     pub f16_kv: bool,
-    pub vocab_only: bool,
-    pub use_mlock: bool,
-    pub use_mmap: bool,
+    pub logits_all: bool,
     pub embedding: bool,
 }
 
@@ -57,17 +68,23 @@ impl Default for ContextParams {
 impl From<ContextParams> for llama_context_params {
     fn from(params: ContextParams) -> Self {
         llama_context_params {
-            n_parts: params.n_parts,
-            n_ctx: params.n_ctx,
             seed: params.seed,
+            n_ctx: params.n_ctx,
+            n_batch: params.n_batch,
+            n_threads: params.n_threads,
+            n_threads_batch: params.n_threads_batch,
+            rope_scaling_type: params.rope_scaling_type,
+            rope_freq_base: params.rope_freq_base,
+            rope_freq_scale: params.rope_freq_scale,
+            yarn_ext_factor: params.yarn_ext_factor,
+            yarn_attn_factor: params.yarn_attn_factor,
+            yarn_beta_fast: params.yarn_beta_fast,
+            yarn_beta_slow: params.yarn_beta_slow,
+            yarn_orig_ctx: params.yarn_orig_ctx,
+            mul_mat_q: params.mul_mat_q,
             f16_kv: params.f16_kv,
-            logits_all: false,
+            logits_all: params.logits_all, // honour the caller's setting
-            vocab_only: params.vocab_only,
-            use_mlock: params.use_mlock,
-            use_mmap: params.use_mmap,
             embedding: params.embedding,
-            progress_callback: None,
-            progress_callback_user_data: null_mut(),
         }
     }
 }
@@ -75,13 +92,22 @@ impl From<ContextParams> for llama_context_params {
 impl From<llama_context_params> for ContextParams {
     fn from(params: llama_context_params) -> Self {
         ContextParams {
-            n_ctx: params.n_ctx,
-            n_parts: params.n_parts,
             seed: params.seed,
+            n_ctx: params.n_ctx,
+            n_batch: params.n_batch,
+            n_threads: params.n_threads,
+            n_threads_batch: params.n_threads_batch,
+            rope_scaling_type: params.rope_scaling_type,
+            rope_freq_base: params.rope_freq_base,
+            rope_freq_scale: params.rope_freq_scale,
+            yarn_ext_factor: params.yarn_ext_factor,
+            yarn_attn_factor: params.yarn_attn_factor,
+            yarn_beta_fast: params.yarn_beta_fast,
+            yarn_beta_slow: params.yarn_beta_slow,
+            yarn_orig_ctx: params.yarn_orig_ctx,
+            mul_mat_q: params.mul_mat_q,
             f16_kv: params.f16_kv,
-            vocab_only: params.vocab_only,
-            use_mlock: params.use_mlock,
-            use_mmap: params.use_mmap,
+            logits_all: params.logits_all,
             embedding: params.embedding,
         }
     }
@@ -90,21 +116,31 @@ impl From<llama_context_params> for ContextParams {
 // Represents the LLamaContext which wraps FFI calls to the llama.cpp library.
 pub(crate) struct LLamaContext {
     ctx: *mut llama_context,
+    pub model: *mut llama_model,
 }
 
+#[allow(dead_code)]
 impl LLamaContext {
     // Creates a new LLamaContext from the specified file and configuration parameters.
     pub fn from_file_and_params(
         path: &str,
-        params: Option<&ContextParams>,
+        model_params: Option<&ModelParams>,
+        context_params: Option<&ContextParams>,
     ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
         let path = CString::new(path).expect("could not convert to CString");
-        let params = ContextParams::or_default(params);
-        let ctx = unsafe { llama_init_from_file(path.into_raw() as *const i8, params) };
+        let model_params = ModelParams::or_default(model_params);
+        // Borrow the path with `as_ptr()`; `into_raw()` would leak the CString.
+        let model = unsafe { llama_load_model_from_file(path.as_ptr().cast(), model_params) };
+        if model.is_null() {
+            return Err("Initializing llama model returned nullptr".into());
+        }
+
+        let context_params = ContextParams::or_default(context_params);
+        let ctx = unsafe { llama_new_context_with_model(model, context_params) };
         if ctx.is_null() {
             return Err("Initializing llama context returned nullptr".into());
         }
-        Ok(Self { ctx })
+        Ok(Self { ctx, model })
     }
 
     // Token logits obtained from the last call to llama_eval()
@@ -117,7 +153,12 @@ impl LLamaContext {
         unsafe { std::slice::from_raw_parts_mut(llama_get_logits(self.ctx), len) }.to_vec()
     }
     pub fn llama_n_vocab(&self) -> i32 {
-        unsafe { llama_n_vocab(self.ctx) }
+        unsafe { llama_n_vocab(self.model) }
+    }
+
+    pub fn llama_get_logits_ith(&self, index: usize) -> Vec<f32> {
+        let float_ptr = unsafe { llama_get_logits_ith(self.ctx, index as i32) };
+        Vec::from(unsafe { std::slice::from_raw_parts(float_ptr, self.llama_n_vocab() as usize) })
     }
 
     // Executes the LLama sampling process with the specified configuration.
@@ -127,6 +168,7 @@ impl LLamaContext {
         last_n_tokens_data: &[i32],
         last_n_tokens_size: i32,
         input: &LlamaInvocation,
+        batch_n_tokens: i32,
     ) -> i32 {
         let top_k = if input.top_k <= 0 {
             self.llama_n_vocab()
@@ -140,7 +182,7 @@ impl LLamaContext {
         };
         let n_vocab = self.llama_n_vocab() as usize;
         // only get the last row, as the sample only requires this.
-        let mut logits = self.llama_get_logits_as_slice(1, n_vocab);
+        let mut logits = self.llama_get_logits_ith((batch_n_tokens - 1) as usize);
 
         // let id : llama_token = 0;
         input
@@ -160,11 +202,11 @@ impl LLamaContext {
             size: candidates.len(),
             sorted: false,
         };
-        let nl_logit = logits[unsafe { llama_token_nl() } as usize];
+        let nl_logit = logits[unsafe { llama_token_nl(self.model) } as usize];
         let last_n_repeat = i32::min(i32::min(last_n_tokens_size, repeat_last_n), n_ctx) as usize;
 
         unsafe {
-            llama_sample_repetition_penalty(
+            llama_sample_repetition_penalties(
                 self.ctx,
                 &mut candidates_p,
                 last_n_tokens_data
@@ -172,22 +214,12 @@ impl LLamaContext {
                     .add((last_n_tokens_size - last_n_repeat as i32) as usize),
                 last_n_repeat,
                 input.repeat_penalty,
-            )
-        };
-        unsafe {
-            llama_sample_frequency_and_presence_penalties(
-                self.ctx,
-                &mut candidates_p,
-                last_n_tokens_data
-                    .as_ptr()
-                    .add((last_n_tokens_size - last_n_repeat as i32) as usize),
-                last_n_repeat,
                 input.frequency_penalty,
                 input.presence_penalty,
             )
         };
         if !input.penalize_nl {
-            logits[unsafe { llama_token_nl() as usize }] = nl_logit;
+            logits[unsafe { llama_token_nl(self.model) as usize }] = nl_logit;
         }
 
         if input.temp <= 0.0 {
@@ -231,26 +263,74 @@ impl LLamaContext {
     }
 
     pub fn llama_token_to_bytes(&self, token: &i32) -> Vec<u8> {
-        let c_ptr = unsafe { llama_token_to_str(self.ctx, *token) };
+        let c_ptr = unsafe { llama_token_get_text(self.model, *token) };
         unsafe { CStr::from_ptr(c_ptr) }.to_bytes().to_vec()
     }
 
     // Evaluates the given tokens with the specified configuration.
     pub fn llama_eval(
         &self,
-        tokens: &[i32],
+        tokens: &mut [i32],
         n_tokens: i32,
         n_past: i32,
-        input: &LlamaInvocation,
+        _input: &LlamaInvocation,
     ) -> Result<(), LLAMACPPErrorCode> {
-        let res =
-            unsafe { llama_eval(self.ctx, tokens.as_ptr(), n_tokens, n_past, input.n_threads) };
+        let res = unsafe { llama_eval(self.ctx, tokens.as_mut_ptr(), n_tokens, n_past) };
         if res == 0 {
             Ok(())
         } else {
             Err(LLAMACPPErrorCode(res))
         }
     }
+
+    // Evaluates the provided batch.
+    pub fn llama_decode(&self, batch: &LlamaBatch) -> Result<(), LLAMACPPErrorCode> {
+        let res = unsafe { llama_decode(self.ctx, batch.into()) };
+        if res == 0 {
+            Ok(())
+        } else {
+            Err(LLAMACPPErrorCode(res))
+        }
+    }
+
+    pub fn llama_token_eos(&self) -> i32 {
+        unsafe { llama_token_eos(self.model) }
+    }
+
+    pub fn llama_token_nl(&self) -> i32 {
+        unsafe { llama_token_nl(self.model) }
+    }
+
+    pub fn llama_token_to_piece(
+        &self,
+        token_id: i32,
+    ) -> Result<String, std::string::FromUtf8Error> {
+        let mut result = vec![0 as i8; 8];
+        let n_tokens = unsafe {
+            llama_token_to_piece(
+                self.model,
+                token_id,
+                result.as_mut_ptr(),
+                result.len() as i32,
+            )
+        };
+        if n_tokens < 0 {
+            result.resize(-n_tokens as usize, 0 as i8);
+            let check = unsafe {
+                llama_token_to_piece(
+                    self.model,
+                    token_id,
+                    result.as_mut_ptr(),
+                    result.len() as i32,
+                )
+            };
+            assert_eq!(check, -n_tokens);
+        } else {
+            result.resize(n_tokens as usize, 0 as i8);
+        }
+        let result_bytes: Vec<u8> = result.into_iter().map(|b| b as u8).collect();
+        String::from_utf8(result_bytes)
+    }
 }
 
 // Provides thread-safe behavior for LLamaContext.
diff --git a/crates/llm-chain-llama/src/executor.rs b/crates/llm-chain-llama/src/executor.rs
index e5118792..174580e6 100644
--- a/crates/llm-chain-llama/src/executor.rs
+++ b/crates/llm-chain-llama/src/executor.rs
@@ -1,9 +1,9 @@
 use std::marker::PhantomData;
 use std::sync::Arc;
 
-use crate::context::{ContextParams, LLamaContext};
+use crate::context::{ContextParams, LLamaContext, LlamaBatch};
 use crate::options::{get_executor_initial_opts, LlamaInvocation, DEFAULT_OPTIONS};
-use crate::tokenizer::{embedding_to_output, llama_token_eos, tokenize, tokens_to_string};
+use crate::tokenizer::{embedding_to_output, tokenize};
 
 use async_trait::async_trait;
 
@@ -61,6 +61,7 @@ impl Executor {
         tokio::task::spawn_blocking(move || {
             let context_size = context_size;
             let context = context.blocking_lock();
+
             let tokenized_stop_prompt = tokenize(
                 &context,
                 input
@@ -69,6 +70,7 @@ impl Executor {
                     .map(|x| x.as_str())
                     .unwrap_or("\n\n"),
                 false,
+                true,
             );
 
             if tokenized_stop_prompt.len() > context_size {
@@ -77,68 +79,78 @@ impl Executor {
             }
 
             let prompt_text = input.prompt.to_text();
-            let tokenized_input = tokenize(&context, prompt_text.as_str(), true);
+
+            let tokenized_input = tokenize(&context, prompt_text.as_str(), true, false);
             if tokenized_input.len() > context_size {
                 must_send!(sender, StreamSegment::Err(ExecutorError::ContextTooSmall));
                 return;
             }
 
-            // Embd contains the prompt and the completion. The longer the prompt, the shorter the completion.
+            // embd contains the prompt and the completion. The longer the
+            // prompt, the shorter the completion.
+            // It will initially contain a copy of the tokenized prompt and then
+            // may be extended with the tokenized answer prefix. After each
+            // sampling the sampled token will also be added to this vector.
+            // This is done so that the sampling function has access to all the
+            // tokens which it may need for repetition penalties, etc.
             let mut embd = tokenized_input.clone();
 
-            // Evaluate the prompt in full.
+            let mut batch = LlamaBatch::new_with_tokens(tokenized_input.clone(), 1);
+            let last_idx = (batch.token_count() - 1) as usize;
+            batch.enable_logits(last_idx);
+
             bail!(
                 context
-                    .llama_eval(
-                        tokenized_input.as_slice(),
-                        tokenized_input.len() as i32,
-                        0,
-                        &input,
-                    )
+                    .llama_decode(&batch)
                     .map_err(|e| ExecutorError::InnerError(e.into())),
                 sender
             );
+            let mut n_cur = batch.token_count();
+            let mut n_used = (batch.token_count() - 1) as usize;
 
             let mut n_remaining = context_size - tokenized_input.len();
-            let mut n_used = tokenized_input.len() - 1;
             if let Some(prefix) = answer_prefix {
-                let tokenized_answer_prefix = tokenize(&context, prefix.as_str(), false);
+                let tokenized_answer_prefix = tokenize(&context, prefix.as_str(), true, true);
                 if tokenized_answer_prefix.len() > context_size {
                     must_send!(sender, StreamSegment::Err(ExecutorError::ContextTooSmall));
                     return;
                 }
-
+                let batch = LlamaBatch::new_with_tokens(tokenized_answer_prefix.clone(), 1);
                 // Evaluate the answer prefix (the role -- should be Assistant: )
                 bail!(
                     context
-                        .llama_eval(
-                            tokenized_answer_prefix.as_slice(),
-                            tokenized_answer_prefix.len() as i32,
-                            n_used as i32,
-                            &input,
-                        )
+                        .llama_decode(&batch)
                         .map_err(|e| ExecutorError::InnerError(e.into())),
                     sender
                 );
                 n_remaining -= tokenized_answer_prefix.len();
-                n_used += tokenized_answer_prefix.len();
                 embd.extend(tokenized_answer_prefix);
+                n_cur += batch.token_count();
+                n_used += (batch.token_count() - 1) as usize;
             }
             embd.resize(context_size, 0);
-            let token_eos = llama_token_eos();
+            let token_eos = context.llama_token_eos();
+
             let mut stop_sequence_i = 0;
+            let mut n_batch = batch.token_count();
+            let mut n_samples = 0;
+            let ignore_initial_nls = input.prompt.to_text().ends_with('?');
+            let nl_token = context.llama_token_nl();
+
             // Generate remaining tokens.
-            let mut leftover_bytes: Vec<u8> = vec![];
             while n_remaining > 0 {
                 let tok = context.llama_sample(
                     context_size as i32,
                     embd.as_slice(),
                     n_used as i32,
                     &input,
+                    n_batch as i32,
                 );
+                n_samples += 1;
                 n_used += 1;
                 n_remaining -= 1;
                 embd[n_used] = tok;
+
                 if tok == token_eos {
                     break;
                 }
@@ -147,47 +159,43 @@ impl Executor {
                 {
                     break;
                 }
+
+                // If the input prompt is in the form of a question then next
+                // predicted tok will be a new line to finish off the question
+                // itself, followed by another new line before the actual
+                // answer. This is what the following is checking for.
+                if n_samples <= 2 && ignore_initial_nls && tok == nl_token {
+                    continue;
+                }
+
                 if tok == tokenized_stop_prompt[stop_sequence_i] {
                     stop_sequence_i += 1;
                     if stop_sequence_i >= tokenized_stop_prompt.len() {
                         break;
                     }
                 } else {
-                    let str_output =
-                        tokens_to_string(&context, &embd[n_used - stop_sequence_i..n_used]);
-                    // XXX: make into chat if chat
-                    must_send!(sender, StreamSegment::Content(str_output));
+                    let piece = bail!(
+                        context
+                            .llama_token_to_piece(tok)
+                            .map_err(|e| ExecutorError::InnerError(e.into())),
+                        sender
+                    );
+                    must_send!(sender, StreamSegment::Content(piece));
                     stop_sequence_i = 0;
-                }
-                bail!(
-                    context
-                        .llama_eval(&embd[n_used..], 1, n_used as i32, &input)
-                        .map_err(|e| ExecutorError::InnerError(e.into())),
-                    sender
-                );
 
-                if n_used >= tokenized_input.len() && stop_sequence_i == 0 {
-                    let bytes_output: Vec<u8> =
-                        [leftover_bytes, context.llama_token_to_bytes(&embd[n_used])].concat();
+                    let batch = LlamaBatch::new_with_token(tok, n_cur as i32);
 
-                    let (str_output, leftover) = decode_up_to_valid_utf8(&bytes_output);
-                    leftover_bytes = leftover;
-                    // XXX: make into chat if chat
-                    if sender.send(StreamSegment::Content(str_output)).is_err() {
-                        panic!("Failed to send");
-                    }
+                    n_batch = batch.token_count();
+                    n_cur += 1;
+
+                    bail!(
+                        context
+                            .llama_decode(&batch)
+                            .map_err(|e| ExecutorError::InnerError(e.into())),
+                        sender
+                    );
                 }
             }
-            if sender
-                .send(StreamSegment::Content(
-                    std::char::REPLACEMENT_CHARACTER
-                        .to_string()
-                        .repeat(leftover_bytes.len()),
-                ))
-                .is_err()
-            {
-                panic!("Failed to send");
-            }
         }); //JoinHandle is dropped? not sure how this works
 
         output
@@ -206,10 +214,11 @@ impl ExecutorTrait for Executor {
             .with_options(&opts_from_env)
             .with_options(&options);
 
-        let (model_path, context_params) = get_executor_initial_opts(&cas)?;
+        let (model_path, model_params, context_params) = get_executor_initial_opts(&cas)?;
         Ok(Self {
             context: Arc::new(Mutex::new(LLamaContext::from_file_and_params(
                 &model_path,
+                Some(&model_params),
                 Some(&context_params),
             )?)),
             options,
@@ -233,18 +242,18 @@ impl ExecutorTrait for Executor {
         let mut tokens_used = tokenizer
             .tokenize_str(&input)
             .map_err(|_e| PromptTokensError::UnableToCompute)?
-            .len() as i32;
+            .len();
         // includes answer_prefix
         let answer_prefix = self.answer_prefix(prompt);
         if let Some(prefix) = answer_prefix {
             let answer_used = tokenizer
                 .tokenize_str(&prefix)
                 .map_err(|_e| PromptTokensError::UnableToCompute)?
-                .len() as i32;
+                .len();
             tokens_used += answer_used
         }
         let max_tokens = self.max_tokens_allowed(options);
-        Ok(TokenCount::new(max_tokens, tokens_used))
+        Ok(TokenCount::new(max_tokens, tokens_used as i32))
     }
 
     fn answer_prefix(&self, prompt: &Prompt) -> Option<String> {
@@ -263,7 +272,7 @@ impl ExecutorTrait for Executor {
     }
 
     fn max_tokens_allowed(&self, _step: &Options) -> i32 {
-        self.context_params.n_ctx
+        self.context_params.n_ctx as i32
     }
 
     fn get_tokenizer(&self, _step: &Options) -> Result<LLamaTokenizer, TokenizerError> {
@@ -289,7 +298,7 @@ impl Tokenizer for LLamaTokenizer<'_> {
     fn tokenize_str(&self, doc: &str) -> Result<TokenCollection, TokenizerError> {
         let tokenized = tokio::task::block_in_place(|| {
             let context = self.context.blocking_lock();
-            tokenize(&context, doc, true)
+            tokenize(&context, doc, true, false)
         });
         Ok(tokenized.into())
     }
@@ -303,34 +312,3 @@ impl Tokenizer for LLamaTokenizer<'_> {
         Ok(output.to_string())
     }
 }
-
-fn decode_up_to_valid_utf8(bytes: &[u8]) -> (String, Vec<u8>) {
-    let (str_output, leftover): (String, Vec<u8>) = match std::str::from_utf8(bytes) {
-        Ok(s) => (s.to_owned(), Vec::new()),
-        Err(unicode_err) => {
-            let index = unicode_err.valid_up_to();
-            let good = &bytes[0..index];
-            match unicode_err.error_len() {
-                None => {
-                    let leftover = bytes[index..].to_vec();
-                    let out = std::str::from_utf8(good).unwrap().to_owned();
-                    (out, leftover)
-                }
-                Some(len) => {
-                    //let bad = &bytes[index..index+len];
-                    //eprintln!("bad utf8: {:?}", bad);
-                    let rest = &bytes[index + len..];
-                    let beggining = std::str::from_utf8(good).unwrap().to_owned();
-                    let (after, leftover) = decode_up_to_valid_utf8(rest);
-
-                    let mut out = beggining;
-                    out.push_str(&std::char::REPLACEMENT_CHARACTER.to_string().repeat(len));
-                    out.push_str(&after);
-
-                    (out, leftover)
-                }
-            }
-        }
-    };
-    (str_output, leftover)
-}
diff --git a/crates/llm-chain-llama/src/lib.rs b/crates/llm-chain-llama/src/lib.rs
index 48b39b19..16222a9d 100644
--- a/crates/llm-chain-llama/src/lib.rs
+++ b/crates/llm-chain-llama/src/lib.rs
@@ -21,13 +21,16 @@
 //!
 //! Happy coding, and enjoy the amazing world of LLMs with llm-chain-llama! 🥳🚀
 
+mod batch;
 mod context;
 mod executor;
+mod model;
 mod options;
 mod tokenizer;
 
 pub use context::ContextParams;
 pub use executor::Executor;
+pub use model::ModelParams;
 
 #[deprecated(note = "Use llm_chain::step::Step instead", since = "0.7.0")]
 pub use llm_chain::step::Step;
diff --git a/crates/llm-chain-llama/src/model.rs b/crates/llm-chain-llama/src/model.rs
new file mode 100644
index 00000000..9a6aba8f
--- /dev/null
+++ b/crates/llm-chain-llama/src/model.rs
@@ -0,0 +1,84 @@
+use llm_chain_llama_sys::{llama_model_default_params, llama_model_params, LLAMA_MAX_DEVICES};
+use std::ptr::{null, null_mut};
+
+// Represents the configuration parameters for a LLama model.
+#[derive(Debug, Clone)]
+pub struct ModelParams {
+    pub n_gpu_layers: i32,
+    pub main_gpu: i32,
+    pub tensor_split: Vec<f32>,
+    pub vocab_only: bool,
+    pub use_mmap: bool,
+    pub use_mlock: bool,
+}
+
+impl ModelParams {
+    // Creates parameters initialised from llama.cpp's defaults.
+    pub fn new() -> ModelParams {
+        unsafe { llama_model_default_params() }.into()
+    }
+    // Returns the default parameters or the user-specified parameters.
+    pub(crate) fn or_default(params: Option<&ModelParams>) -> llama_model_params {
+        match params {
+            Some(params) => params.clone().into(),
+            None => unsafe { llama_model_default_params() },
+        }
+    }
+}
+
+impl Default for ModelParams {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl From<ModelParams> for llama_model_params {
+    fn from(params: ModelParams) -> Self {
+        // `params` is dropped when this function returns, so a pointer into
+        // `params.tensor_split` would dangle. An empty split is passed as
+        // null; otherwise the values are copied into a buffer of
+        // `LLAMA_MAX_DEVICES` entries (zero-padded or truncated) that is
+        // deliberately leaked so the pointer remains valid for llama.cpp.
+        let tensor_split = if params.tensor_split.is_empty() {
+            null()
+        } else {
+            let mut split = vec![0.0_f32; LLAMA_MAX_DEVICES as usize];
+            let len = params.tensor_split.len().min(split.len());
+            split[..len].copy_from_slice(&params.tensor_split[..len]);
+            Box::leak(split.into_boxed_slice()).as_ptr()
+        };
+        llama_model_params {
+            n_gpu_layers: params.n_gpu_layers,
+            main_gpu: params.main_gpu,
+            tensor_split,
+            vocab_only: params.vocab_only,
+            use_mmap: params.use_mmap,
+            use_mlock: params.use_mlock,
+            progress_callback: None,
+            progress_callback_user_data: null_mut(),
+        }
+    }
+}
+
+impl From<llama_model_params> for ModelParams {
+    fn from(params: llama_model_params) -> Self {
+        // A non-null `tensor_split` is read as `LLAMA_MAX_DEVICES` floats.
+        let tensor_split = unsafe {
+            if params.tensor_split.is_null() {
+                Vec::new()
+            } else {
+                let slice =
+                    std::slice::from_raw_parts(params.tensor_split, LLAMA_MAX_DEVICES as usize);
+                slice.to_vec()
+            }
+        };
+        ModelParams {
+            n_gpu_layers: params.n_gpu_layers,
+            main_gpu: params.main_gpu,
+            tensor_split,
+            vocab_only: params.vocab_only,
+            use_mmap: params.use_mmap,
+            use_mlock: params.use_mlock,
+        }
+    }
+}
diff --git a/crates/llm-chain-llama/src/options.rs b/crates/llm-chain-llama/src/options.rs
index 37b62318..e6bd8750 100644
--- a/crates/llm-chain-llama/src/options.rs
+++ b/crates/llm-chain-llama/src/options.rs
@@ -9,10 +9,10 @@ use llm_chain::{
 use std::collections::HashMap;
 
 use crate::context::ContextParams;
+use crate::model::ModelParams;
 
 /// Represents a concrete call to the LLM model, with all the parameters specified, and no implicit behavior.
 pub struct LlamaInvocation {
-    pub(crate) n_threads: i32,
     pub(crate) n_tok_predict: usize,
     pub(crate) logit_bias: HashMap<i32, f32>,
     pub(crate) top_k: i32,
@@ -49,7 +49,6 @@ impl LlamaInvocation {
         opt: OptionsCascade,
         prompt: &Prompt,
     ) -> Result<LlamaInvocation, ExecutorCreationError> {
-        let n_threads = opt_extract!(opt, n_threads, NThreads)?;
         let n_tok_predict = opt_extract!(opt, n_tok_predict, MaxTokens)?;
         let top_k = opt_extract!(opt, top_k, TopK)?;
         let top_p = opt_extract!(opt, top_p, TopP)?;
@@ -70,7 +69,6 @@ impl LlamaInvocation {
         let logit_bias = HashMap::<i32, f32>::new(); // token_bias.as_i32_f32_hashmap()?;
 
         Ok(LlamaInvocation {
-            n_threads: *n_threads as i32,
             n_tok_predict: *n_tok_predict,
             logit_bias,
             top_k: *top_k,
@@ -97,6 +95,7 @@ lazy_static! {
         // ModelType: "llama", // not used
         NThreads: 1_usize,
         MaxTokens: 0_usize,
+        MaxBatchSize: 512_usize,
         MaxContextSize: 2048_usize,
         TopK: 40_i32,
         TopP: 0.95,
@@ -111,18 +110,58 @@ lazy_static! {
         MirostatTau: 5.0,
         MirostatEta: 0.1,
         PenalizeNl: true,
-        StopSequence: vec!["\n\n".to_string()]
+        StopSequence: vec!["\n\n".to_string()],
+        NGpuLayers: 0_i32,
+        MainGpu: 0_i32,
+        TensorSplit: Vec::new(),
+        VocabOnly: false,
+        UseMmap: true,
+        UseMlock: false
     );
 }
 
 pub(crate) fn get_executor_initial_opts(
     opt: &OptionsCascade,
-) -> Result<(String, ContextParams), ExecutorCreationError> {
+) -> Result<(String, ModelParams, ContextParams), ExecutorCreationError> {
     let model = opt_extract!(opt, model, Model)?;
-    let max_context_size = opt_extract!(opt, max_context_size, MaxContextSize)?;
+
+    let mut mp = ModelParams::new();
+    if let Some(Opt::NGpuLayers(value)) = opt.get(OptDiscriminants::NGpuLayers) {
+        mp.n_gpu_layers = *value;
+    }
+    if let Some(Opt::MainGpu(value)) = opt.get(OptDiscriminants::MainGpu) {
+        mp.main_gpu = *value;
+    }
+    if let Some(Opt::TensorSplit(values)) = opt.get(OptDiscriminants::TensorSplit) {
+        mp.tensor_split = values.clone();
+    }
+    // Currently, setting vocab_only is not allowed, as it would cause
+    // a crash when using the llama executor, which needs to have weights
+    // loaded in order to work.
+    mp.vocab_only = false;
+
+    if let Some(Opt::UseMmap(value)) = opt.get(OptDiscriminants::UseMmap) {
+        mp.use_mmap = *value;
+    }
+    if let Some(Opt::UseMlock(value)) = opt.get(OptDiscriminants::UseMlock) {
+        mp.use_mlock = *value;
+    }
 
     let mut cp = ContextParams::new();
-    cp.n_ctx = *max_context_size as i32;
+    if let Some(Opt::NThreads(value)) = opt.get(OptDiscriminants::NThreads) {
+        cp.n_threads = *value as u32;
+    }
+
+    let max_context_size = opt_extract!(opt, max_context_size, MaxContextSize)?;
+    cp.n_ctx = *max_context_size as u32;
+
+    let n_batch = opt_extract!(opt, nbatch, MaxBatchSize)?;
+    cp.n_batch = *n_batch as u32;
+    if max_context_size < n_batch {
+        return Err(ExecutorCreationError::InvalidValue(
+            "MaxBatchSize must be less than or equal to MaxContextSize".to_string(),
+        ));
+    }
 
-    Ok((model.to_path(), cp))
+    Ok((model.to_path(), mp, cp))
 }
diff --git a/crates/llm-chain-llama/src/tokenizer.rs b/crates/llm-chain-llama/src/tokenizer.rs
index 6e12ce97..373ade31 100644
--- a/crates/llm-chain-llama/src/tokenizer.rs
+++ b/crates/llm-chain-llama/src/tokenizer.rs
@@ -3,9 +3,7 @@ use llm_chain::prompt::Data;
 use std::ffi::{CStr, CString};
 use std::os::raw::c_char;
 
-use llm_chain_llama_sys::{
-    llama_token, llama_token_eos as inner_eos, llama_token_to_str, llama_tokenize,
-};
+use llm_chain_llama_sys::{llama_token, llama_token_get_text, llama_tokenize};
 
 use crate::context::LLamaContext;
 
@@ -25,17 +23,13 @@ fn to_cstring(s: &str) -> CString {
 ///
 /// A Rust String representation of the given llama_token.
 fn to_output(context: &LLamaContext, token: i32) -> String {
-    let c_ptr = unsafe { llama_token_to_str(**context, token) };
+    let c_ptr = unsafe { llama_token_get_text(context.model, token) };
     let native_string = unsafe { CStr::from_ptr(c_ptr) }
         .to_string_lossy()
         .into_owned();
     native_string
 }
 
-pub fn llama_token_eos() -> i32 {
-    unsafe { inner_eos() }
-}
-
 /// Helper function to tokenize text using the provided LLamaContext and add_bos option.
 ///
 /// # Arguments
@@ -47,21 +41,42 @@ pub fn llama_token_eos() -> i32 {
 /// # Returns
 ///
 /// A Vec of llama_tokens representing the tokenized input.
-pub(crate) fn tokenize(context: &LLamaContext, text: &str, add_bos: bool) -> Vec<llama_token> {
+pub(crate) fn tokenize(
+    context: &LLamaContext,
+    text: &str,
+    add_bos: bool,
+    special: bool,
+) -> Vec<llama_token> {
     let mut res = Vec::with_capacity(text.len() + add_bos as usize);
     let c_text = to_cstring(text);
-
-    let n = unsafe {
+    let n_tokens = unsafe {
         llama_tokenize(
-            **context,
+            context.model,
             c_text.as_ptr() as *const c_char,
+            c_text.to_bytes().len() as i32,
             res.as_mut_ptr(),
             res.capacity() as i32,
             add_bos,
+            special,
         )
     };
-    assert!(n >= 0);
-    unsafe { res.set_len(n as usize) };
+    if n_tokens < 0 {
+        res.resize(-n_tokens as usize, 0);
+        let new_n_tokens = unsafe {
+            llama_tokenize(
+                context.model,
+                c_text.as_ptr() as *const c_char,
+                c_text.to_bytes().len() as i32,
+                res.as_mut_ptr(),
+                res.capacity() as i32,
+                add_bos,
+                special,
+            )
+        };
+        assert!(new_n_tokens == -n_tokens);
+    } else {
+        unsafe { res.set_len(n_tokens as usize) };
+    }
     res
 }
 
diff --git a/crates/llm-chain/src/options.rs b/crates/llm-chain/src/options.rs
index ee180638..bdef2cb4 100644
--- a/crates/llm-chain/src/options.rs
+++ b/crates/llm-chain/src/options.rs
@@ -341,6 +341,9 @@ pub enum Opt {
     MaxTokens(usize),
     /// The maximum context size of the model.
     MaxContextSize(usize),
+    /// The maximum batch size of the model.
+    /// This is used by llama models.
+    MaxBatchSize(usize),
     /// The sequences that, when encountered, will cause the model to stop generating further tokens.
     /// OpenAI models allow up to four stop sequences.
     StopSequence(Vec<String>),
@@ -394,6 +397,19 @@ pub enum Opt {
     User(String),
     /// The type of the model.
     ModelType(String),
+
+    // The number of layers to be stored in GPU VRAM for llm-chain-llama.
+    NGpuLayers(i32),
+    // The GPU that should be used for scratch and small tensors for llm-chain-llama.
+    MainGpu(i32),
+    // How the layers should be split across the available GPUs for llm-chain-llama.
+    TensorSplit(Vec<f32>),
+    // Only load the vocabulary for llm-chain-llama, no weights will be loaded.
+    VocabOnly(bool),
+    // Use memory mapped files for llm-chain-llama where possible.
+    UseMmap(bool),
+    // Force the system to keep the model in memory for llm-chain-llama.
+    UseMlock(bool),
 }
 
 // Helper function to extract environment variables
diff --git a/crates/llm-chain/src/traits.rs b/crates/llm-chain/src/traits.rs
index b98b5c36..65bc4284 100644
--- a/crates/llm-chain/src/traits.rs
+++ b/crates/llm-chain/src/traits.rs
@@ -27,6 +27,8 @@ pub enum ExecutorCreationError {
     InnerError(#[from] Box<dyn Error + Send + Sync>),
     #[error("Field must be set: {0}")]
     FieldRequiredError(String),
+    #[error("Invalid value. {0}")]
+    InvalidValue(String),
 }
 
 #[derive(thiserror::Error, Debug)]