Commit 9649fad

add fp8, update FA
flozi00 committed May 22, 2024
1 parent f207d40 commit 9649fad
Showing 3 changed files with 7 additions and 2 deletions.
4 changes: 4 additions & 0 deletions launcher/src/main.rs
@@ -30,6 +30,7 @@ enum Quantization {
     Hqq_4bit,
     Hqq_3bit,
     Hqq_2bit,
+    Fp8,
 }

 impl std::fmt::Display for Quantization {
@@ -63,6 +64,9 @@ impl std::fmt::Display for Quantization {
             Quantization::Hqq_2bit => {
                 write!(f, "hqq-2bit")
             }
+            Quantization::Fp8 => {
+                write!(f, "fp8")
+            }
         }
     }
 }
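Note: the Display impl is what turns the new variant into the literal string "fp8" that leaves the launcher. A minimal sketch of the round trip on the Python side, assuming the launcher forwards that string to the server's quantize option, as the matching enum added in server/lorax_server/cli.py below suggests (members abridged; plain str-Enum value lookup, not code from this commit):

from enum import Enum

class Quantization(str, Enum):
    hqq_2bit = "hqq-2bit"
    fp8 = "fp8"

# The Rust side writes "fp8"; a value lookup on the str-Enum recovers the member.
assert Quantization("fp8") is Quantization.fp8
print(Quantization.fp8.value)  # -> fp8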
4 changes: 2 additions & 2 deletions server/Makefile-flash-att-v2
@@ -1,11 +1,11 @@
-flash_att_v2_commit_cuda := 23e8fa5a263d1c7122bc46a86ef32030ee7130f9
+flash_att_v2_commit_cuda := v2.5.8
 flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6


 flash-attention-v2-cuda:
 	# Clone flash attention
 	pip install -U packaging ninja --no-cache-dir
-	git clone https://github.com/HazyResearch/flash-attention.git flash-attention-v2
+	git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2

 build-flash-attention-v2-cuda: flash-attention-v2-cuda
 	cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_cuda)
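Note: the CUDA checkout now tracks the upstream Dao-AILab repository at the v2.5.8 tag instead of a raw commit hash. A quick sanity check that the pinned build is what actually got installed, assuming the wheel exposes flash_attn.__version__ as recent releases do:

import flash_attn

# The Makefile checks out the v2.5.8 tag, so the installed package
# is expected to report that version.
print(flash_attn.__version__)  # e.g. "2.5.8"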
1 change: 1 addition & 0 deletions server/lorax_server/cli.py
@@ -22,6 +22,7 @@ class Quantization(str, Enum):
     hqq_4bit = "hqq-4bit"
     hqq_3bit = "hqq-3bit"
     hqq_2bit = "hqq-2bit"
+    fp8 = "fp8"


 class Dtype(str, Enum):
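Note: this commit only registers the fp8 option on the launcher and server CLI surfaces; the server-side kernels are not part of the diff. For orientation, a minimal sketch of what per-tensor fp8 weight quantization generally looks like in PyTorch (requires a build with torch.float8_e4m3fn, PyTorch 2.1+; illustrative only, not the lorax implementation):

import torch

def fp8_quantize(weight: torch.Tensor):
    """Illustrative per-tensor cast to float8_e4m3fn with a dequantization scale."""
    finfo = torch.finfo(torch.float8_e4m3fn)
    # Map the largest magnitude onto the e4m3 representable range.
    scale = finfo.max / weight.abs().max().clamp(min=1e-12)
    qweight = (weight * scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    # Return the inverse scale: dequantization is qweight.float() * inv_scale.
    return qweight, scale.reciprocal()

w = torch.randn(16, 16)
qw, inv_scale = fp8_quantize(w)
print(qw.dtype, inv_scale.item())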
