diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index d5548030e..4073f72c4 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -30,6 +30,7 @@ enum Quantization {
     Hqq_4bit,
     Hqq_3bit,
     Hqq_2bit,
+    Fp8,
 }
 
 impl std::fmt::Display for Quantization {
@@ -63,6 +64,9 @@ impl std::fmt::Display for Quantization {
             Quantization::Hqq_2bit => {
                 write!(f, "hqq-2bit")
             }
+            Quantization::Fp8 => {
+                write!(f, "fp8")
+            }
         }
     }
 }
diff --git a/server/Makefile-flash-att-v2 b/server/Makefile-flash-att-v2
index 36ef576ae..bbff00909 100644
--- a/server/Makefile-flash-att-v2
+++ b/server/Makefile-flash-att-v2
@@ -1,11 +1,11 @@
-flash_att_v2_commit_cuda := 23e8fa5a263d1c7122bc46a86ef32030ee7130f9
+flash_att_v2_commit_cuda := v2.5.8
 flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6
 
 
 flash-attention-v2-cuda:
 	# Clone flash attention
 	pip install -U packaging ninja --no-cache-dir
-	git clone https://github.com/HazyResearch/flash-attention.git flash-attention-v2
+	git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2
 
 build-flash-attention-v2-cuda: flash-attention-v2-cuda
 	cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_cuda)
diff --git a/server/lorax_server/cli.py b/server/lorax_server/cli.py
index 5fe7532d1..f323a153e 100644
--- a/server/lorax_server/cli.py
+++ b/server/lorax_server/cli.py
@@ -22,6 +22,7 @@ class Quantization(str, Enum):
     hqq_4bit = "hqq-4bit"
     hqq_3bit = "hqq-3bit"
    hqq_2bit = "hqq-2bit"
+    fp8 = "fp8"
 
 
 class Dtype(str, Enum):
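The two halves of this change have to agree: the string produced by the launcher's `Display` impl is what the Python server maps back through its own `Quantization` enum (`fp8 = "fp8"` in `cli.py`). A minimal, standalone Rust sketch of that round trip follows; it is not the launcher's actual code, the enum is trimmed to the variants visible in the hunks above, and the `--quantize fp8` flag name in the comment is assumed from the existing CLI conventions.

```rust
// Standalone sketch (not part of the PR): a trimmed-down copy of the enum from
// launcher/src/main.rs, checking that the new Fp8 variant renders as "fp8".
// That string must stay in sync with the `fp8 = "fp8"` member added to the
// Quantization enum in server/lorax_server/cli.py.
use std::fmt;

#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy)]
enum Quantization {
    Hqq_4bit,
    Hqq_3bit,
    Hqq_2bit,
    Fp8,
}

impl fmt::Display for Quantization {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Quantization::Hqq_4bit => write!(f, "hqq-4bit"),
            Quantization::Hqq_3bit => write!(f, "hqq-3bit"),
            Quantization::Hqq_2bit => write!(f, "hqq-2bit"),
            Quantization::Fp8 => write!(f, "fp8"),
        }
    }
}

fn main() {
    // The Display form is what ends up on the command line (assumed: `--quantize fp8`)
    // and is then parsed by the Python-side enum.
    assert_eq!(Quantization::Fp8.to_string(), "fp8");
    println!("{}", Quantization::Fp8); // prints "fp8"
}
```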