Commit: ruff

tgaddair committed Oct 18, 2024
1 parent 401f1ae commit 1d9f5d4
Showing 27 changed files with 119 additions and 217 deletions.

@@ -20,8 +20,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
-
 import dropout_layer_norm
 import rotary_emb
 import torch
@@ -32,6 +30,7 @@
 from lorax_server.adapters.weights import AdapterBatchData
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     FastLayerNorm,
     MultiAdapterHead,

@@ -15,7 +15,6 @@
 
 from typing import Any, List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
 import numpy as np
 import torch
 import torch.distributed
@@ -27,6 +26,7 @@
 from lorax_server.adapters.weights import AdapterBatchData
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     FastLayerNorm,
     FastLinear,

@@ -20,7 +20,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
 import torch
 import torch.distributed
 from torch import nn
@@ -40,6 +39,7 @@
 from lorax_server.layers.rotary import PositionRotaryEmbedding
 from lorax_server.layers.tensor_parallel import TensorParallelHead
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import MultiAdapterHead, TensorParallelAdapterRowLinear, TensorParallelMultiAdapterLinear
 from lorax_server.utils.lora import (
     DOWN_PROJ,

@@ -15,7 +15,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
 import torch
 import torch.distributed
 from torch import nn
@@ -26,6 +25,7 @@
 from lorax_server.adapters import AdapterBatchData
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     PositionRotaryEmbedding,
     TensorParallelAdapterRowLinear,

@@ -20,7 +20,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
 import torch
 import torch.distributed
 from torch import nn
@@ -31,6 +30,7 @@
 from lorax_server.adapters import AdapterBatchData
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     FastLayerNorm,
     TensorParallelAdapterRowLinear,

@@ -20,8 +20,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
-
 # Flash attention imports
 import dropout_layer_norm
 import torch
@@ -32,6 +30,7 @@
 
 from lorax_server.adapters import AdapterBatchData
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     MultiAdapterHead,
     PositionRotaryEmbedding,

@@ -20,8 +20,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
-
 # Flash attention imports
 import dropout_layer_norm
 import torch
@@ -32,6 +30,7 @@
 
 from lorax_server.adapters import AdapterBatchData
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.flash_attn import HAS_FLASH_ATTN_V2_CUDA
 from lorax_server.utils.layers import (
     MultiAdapterHead,

@@ -21,8 +21,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
-
 # Flash attention imports
 import dropout_layer_norm
 import numpy as np
@@ -35,6 +33,7 @@
 from lorax_server.adapters import AdapterBatchData
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.flash_attn import HAS_FLASH_ATTN_V2_CUDA
 from lorax_server.utils.layers import (
     FastLinear,
@@ -189,13 +188,15 @@ def _load_gqa(config, prefix: str, weights):
         config.hidden_size,
     ], f"{list(weight.shape)} != {[(num_heads + 2 * num_key_value_heads) * head_size, config.hidden_size]}"
 
-    return TensorParallelColumnLinear(get_linear(
-        weight,
-        bias=None,
-        quantize=config.quantize,
-        weight_scale=weight_scale,
-        input_scale=input_scale,
-    ))
+    return TensorParallelColumnLinear(
+        get_linear(
+            weight,
+            bias=None,
+            quantize=config.quantize,
+            weight_scale=weight_scale,
+            input_scale=input_scale,
+        )
+    )
 
 
 def _load_experts(config, prefix, mat, weights):

@@ -20,7 +20,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
 import torch
 import torch.distributed
 from torch import nn
@@ -30,6 +29,7 @@
 
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     FastLayerNorm,
     PositionRotaryEmbedding,

@@ -20,8 +20,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
-
 # Flash attention imports
 import dropout_layer_norm
 import torch
@@ -33,6 +31,7 @@
 from lorax_server.adapters import AdapterBatchData
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     MultiAdapterHead,
     PositionRotaryEmbedding,

@@ -11,7 +11,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
 import torch
 import torch.distributed
 from torch import nn
@@ -20,6 +19,7 @@
 from lorax_server.adapters import AdapterBatchData
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     FastLayerNorm,
     MultiAdapterHead,

@@ -6,8 +6,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
-
 # Flash attention imports
 import dropout_layer_norm
 import torch
@@ -18,6 +16,7 @@
 from lorax_server.adapters import AdapterBatchData
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     MultiAdapterHead,
     PositionRotaryEmbedding,

@@ -6,8 +6,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
-
 # Flash attention imports
 import dropout_layer_norm
 import torch
@@ -19,6 +17,7 @@
 from lorax_server.adapters import AdapterBatchData
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     MultiAdapterHead,
     PositionRotaryEmbedding,

@@ -1,6 +1,5 @@
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
 import torch
 import torch.distributed
 from torch import nn
@@ -9,6 +8,7 @@
 
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     FastLayerNorm,
     PositionRotaryEmbedding,

@@ -1,13 +1,13 @@
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
 import torch
 import torch.distributed
 from torch import nn
 from transformers.activations import ACT2FN
 
 from lorax_server.models.custom_modeling.utils import prepend
 from lorax_server.utils import flash_attn, paged_attention
+from lorax_server.utils.attention.common import Seqlen
 from lorax_server.utils.layers import (
     FastLayerNorm,
     TensorParallelColumnLinear,

server/lorax_server/models/custom_modeling/llava_next.py (1 addition, 1 deletion)
@@ -16,7 +16,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
 import torch
 import torch.utils.checkpoint
 from torch import nn
@@ -32,6 +31,7 @@
     load_text_model,
     load_vision_model,
 )
+from lorax_server.utils.attention.common import Seqlen
 
 
 def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):

server/lorax_server/models/custom_modeling/mllama.py (1 addition, 2 deletions)
@@ -16,8 +16,6 @@
 
 from typing import List, Optional, Tuple
 
-from lorax_server.utils.attention.common import Seqlen
-
 import flash_attn_2_cuda
 import torch
 import torch.nn.functional as F
@@ -36,6 +34,7 @@
     FlashLlamaForCausalLM,
     FlashLlamaLayer,
 )
+from lorax_server.utils.attention.common import Seqlen
 
 
 # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb

(Diffs for the remaining changed files are not shown.)