Commit 3342c6f

Update web-rwkv to v0.9.3

cryscan committed Dec 6, 2024
1 parent 2660177
Showing 3 changed files with 14 additions and 14 deletions.
Cargo.lock: 8 changes (4 additions & 4 deletions)

Cargo.lock is a generated file, so its diff is not rendered by default.

Cargo.toml: 2 changes (1 addition & 1 deletion)
@@ -35,7 +35,7 @@ path = "crates/ai00-core"
 # path = "../web-rwkv"
 default-features = false
 features = ["native"]
-version = "0.9.0"
+version = "0.9.3"

 [patch.crates-io]
 hf-hub = { git = "https://github.com/cgisky1980/hf-hub.git", branch = "main" }
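For orientation, the dependency entry being edited plausibly expands to the sketch below. Only the lines also visible in the hunk are confirmed by the diff; the [dependencies.web-rwkv] table header and the comment text are assumptions.

# Hypothetical reconstruction of the edited Cargo.toml entry; only the
# lines also shown in the hunk above are confirmed by this commit.
[dependencies.web-rwkv]
# path = "../web-rwkv"     # local checkout, kept commented out
default-features = false   # opt out of default features
features = ["native"]      # enable only the native (non-web) backend
version = "0.9.3"          # the version bumped by this commit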
assets/configs/Config.toml: 18 changes (9 additions & 9 deletions)
@@ -1,13 +1,13 @@
 [model]
-embed_device = "Cpu" # Device to put the embed tensor ("Cpu" or "Gpu").
-max_batch = 8 # The maximum batches that are cached on GPU.
-name = "RWKV-x060-World-3B-v2.1-20240417-ctx4096.st" # Name of the model.
-path = "assets/models" # Path to the folder containing all models.
-precision = "Fp16" # Precision for intermediate tensors ("Fp16" or "Fp32"). "Fp32" yields better outputs but slower.
-quant = 16 # Layers to be quantized.
-quant_type = "Int8" # Quantization type ("Int8" or "NF4").
-stop = ["\n\n"] # Additional stop words in generation.
-token_chunk_size = 128 # Size of token chunk that is inferred at once. For high end GPUs, this could be 64 or 128 (faster).
+embed_device = "Cpu" # Device to put the embed tensor ("Cpu" or "Gpu").
+max_batch = 8 # The maximum batches that are cached on GPU.
+name = "RWKV-x060-World-7B-v3-20241112-ctx4096.st" # Name of the model.
+path = "assets/models" # Path to the folder containing all models.
+precision = "Fp16" # Precision for intermediate tensors ("Fp16" or "Fp32"). "Fp32" yields better outputs but slower.
+quant = 0 # Layers to be quantized.
+quant_type = "Int8" # Quantization type ("Int8" or "NF4").
+stop = ["\n\n"] # Additional stop words in generation.
+token_chunk_size = 128 # Size of token chunk that is inferred at once. For high end GPUs, this could be 64 or 128 (faster).

 # [[state]] # State-tuned initial state.
 # id = "fd7a60ed-7807-449f-8256-bccae3246222" # UUID for this state, which is used to specify which one to use in the APIs.
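Beyond swapping the 3B v2.1 checkpoint for the 7B v3 one, note that the commit also changes quant from 16 to 0, disabling quantization entirely. If the 7B weights do not fit in VRAM, quantization can be re-enabled; the values below are an illustrative assumption, not part of this commit.

# Hypothetical [model] override for tighter VRAM budgets (values assumed, not from this commit):
quant = 32          # number of layers to quantize; the committed value 0 disables quantization
quant_type = "NF4"  # NF4 yields a smaller footprint than Int8, at some cost in output quality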

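The trailing comments show the optional [[state]] tables for state-tuned initial states. A minimal active entry might look like the sketch below; only id appears in the snippet, while name and path are assumed keys modeled on typical ai00 configs.

# Hypothetical active state entry (name and path are assumed keys):
[[state]]
id = "fd7a60ed-7807-449f-8256-bccae3246222"  # UUID used to select this state via the APIs
name = "example-state"                       # display name (assumed key)
path = "assets/states/example.state"         # location of the state file (assumed key)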