refactor: Refactor JIT and AOT build script (#567)

Previously, JIT and AOT packaging was a bit broken. This PR produces a good sdist for JIT mode and a good wheel for AOT mode. ## Changes Common changes: 1. Remove the symlinks. Symlinks cause lots of duplication when searching in VSCode. 2. In the package distribution (sdist or wheel), add data files to `python/flashinfer/data/`, i.e. inside the Python package folder. This is strongly recommended by setuptools. * Data files include: `version.txt`, FlashInfer headers, Cutlass headers. * Symlinks will be created when building the wheel, and will be removed when the build finishes unless the `develop` command is being used. 3. Exclude unneeded Cutlass docs and files from the wheel and sdist. AOT changes: 1. Remove the `flashinfer-aot` dir. Its contents are moved to `python/`. 2. Merge all kernels into one pybind module. This is good for compilation speed. (`_kernels_sm90` is preserved as a separate `.so` file.) 3. The AOT wheel can now be built with the following command: ```bash cd flashinfer/python TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" python3 aot_setup.py bdist_wheel ls -la dist/ ``` 4. The AOT wheel can also be built for an editable install (for development purposes): ```bash cd flashinfer/python TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" python3 aot_setup.py develop ``` JIT changes: 1. JIT mode can now be installed in various ways: ```bash cd flashinfer/python pip install -v . # Regular install from source pip install -v -e . # Editable install python -m build --sdist # Build sdist pip install dist/flashinfer-*.tar.gz # Install from sdist ``` ## Directory structure of built package See attached. [dir-wheel.txt](https://github.com/user-attachments/files/17562193/dir-wheel.txt) [dir-sdist.txt](https://github.com/user-attachments/files/17562194/dir-sdist.txt) ## Tests I was able to pass `pytest -sv test_norm.py test_bmm_fp8.py` using various ways of installation: 1. Editable install 2. Regular install from source 3. Install from sdist 4. Install from wheel
flashinfer-ai · Oct 30, 2024 · 7df90dd · 7df90dd
1 parent e46d9a7
commit 7df90dd
Show file tree

Hide file tree

Showing 40 changed files with 425 additions and 344 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,7 +13,12 @@ src/generated/
 python/csrc/generated/
 python/flashinfer/_build_meta.py
 python/flashinfer/jit/aot_config.py
-flashinfer-aot/csrc_aot/generated/
+python/csrc_aot/generated/
+
+# Package files
+python/flashinfer/data/
+python/flashinfer/version.txt
+python/MANIFEST.in
 
 # Generated documentation files
 docs/generated

diff --git a/docs/installation.rst b/docs/installation.rst
@@ -138,7 +138,7 @@ You can follow the steps below to install FlashInfer from source code:
     
        pip install ninja
 
-4. Compile FlashInfer:
+4. Install FlashInfer:
 
    .. tabs::
 
@@ -153,8 +153,17 @@ You can follow the steps below to install FlashInfer from source code:
 
            .. code-block:: bash
 
-               cd flashinfer/flashinfer-aot
-               pip install -e . -v
+               cd flashinfer/python
+               TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" python3 aot_setup.py bdist_wheel
+               pip install dist/flashinfer-*.whl
+
+       .. tab:: Create sdist for JIT mode
+
+           .. code-block:: bash
+
+               cd flashinfer/python
+               python -m build --sdist
+               ls -la dist/
 
 C++ API
 -------

diff --git a/flashinfer-aot/3rdparty b/flashinfer-aot/3rdparty
diff --git a/flashinfer-aot/MANIFEST.in b/flashinfer-aot/MANIFEST.in
diff --git a/flashinfer-aot/csrc b/flashinfer-aot/csrc
diff --git a/flashinfer-aot/csrc_aot/flashinfer_ops_decode.cu b/flashinfer-aot/csrc_aot/flashinfer_ops_decode.cu
diff --git a/flashinfer-aot/csrc_aot/flashinfer_ops_prefill.cu b/flashinfer-aot/csrc_aot/flashinfer_ops_prefill.cu
diff --git a/flashinfer-aot/flashinfer b/flashinfer-aot/flashinfer
diff --git a/flashinfer-aot/include b/flashinfer-aot/include
diff --git a/flashinfer-aot/version.txt b/flashinfer-aot/version.txt
diff --git a/include/flashinfer/attention/scheduler.cuh b/include/flashinfer/attention/scheduler.cuh
@@ -50,7 +50,7 @@ __global__ void BatchDecodeWithPagedKVCacheKernel(const __grid_constant__
  *   the new batch size after the partition.
  */
 template <typename IdType>
-auto PartitionPagedKVCacheBinarySearchMinNumPagePerBatch(
+inline auto PartitionPagedKVCacheBinarySearchMinNumPagePerBatch(
     const uint32_t max_grid_size, const uint32_t num_kv_heads, const std::vector<IdType>& num_pages,
     const uint32_t min_num_pages_per_batch = 1) {
   uint32_t low = min_num_pages_per_batch, high = 0;
@@ -77,7 +77,7 @@ auto PartitionPagedKVCacheBinarySearchMinNumPagePerBatch(
   return std::make_tuple(low, new_batch_size);
 }
 
-auto PrefillBinarySearchKVChunkSize(const uint32_t max_batch_size_if_split,
+inline auto PrefillBinarySearchKVChunkSize(const uint32_t max_batch_size_if_split,
                                     const std::vector<int64_t>& packed_qo_len_arr,
                                     const std::vector<int64_t>& kv_len_arr,
                                     const uint32_t qo_chunk_size,
@@ -129,7 +129,7 @@ auto PrefillBinarySearchKVChunkSize(const uint32_t max_batch_size_if_split,
  */
 template <uint32_t GROUP_SIZE, uint32_t HEAD_DIM, PosEncodingMode POS_ENCODING_MODE,
           typename AttentionVariant>
-cudaError_t BatchDecodeWithPagedKVCacheWorkEstimationDispatched(
+inline cudaError_t BatchDecodeWithPagedKVCacheWorkEstimationDispatched(
     bool& split_kv, uint32_t& max_grid_size, uint32_t& max_num_pages_per_batch,
     uint32_t& new_batch_size, uint32_t batch_size, typename AttentionVariant::IdType* kv_indptr_h,
     const uint32_t num_qo_heads, const uint32_t page_size, bool enable_cuda_graph,
@@ -201,7 +201,7 @@ cudaError_t BatchDecodeWithPagedKVCacheWorkEstimationDispatched(
  * \return status Indicates whether CUDA calls are successful
  */
 template <typename IdType>
-auto DecodeSplitKVIndptr(IdType* indptr_h, uint32_t batch_size, uint32_t kv_chunk_size) {
+inline auto DecodeSplitKVIndptr(IdType* indptr_h, uint32_t batch_size, uint32_t kv_chunk_size) {
   std::vector<IdType> request_indices, kv_tile_indices, o_indptr;
   o_indptr.push_back(0);
 
@@ -277,7 +277,7 @@ struct DecodePlanInfo {
 };
 
 template <uint32_t HEAD_DIM, PosEncodingMode POS_ENCODING_MODE, typename AttentionVariant>
-cudaError_t DecodePlan(void* float_buffer, size_t float_workspace_size_in_bytes, void* int_buffer,
+inline cudaError_t DecodePlan(void* float_buffer, size_t float_workspace_size_in_bytes, void* int_buffer,
                        void* page_locked_int_buffer, size_t int_workspace_size_in_bytes,
                        DecodePlanInfo& plan_info, typename AttentionVariant::IdType* indptr_h,
                        uint32_t batch_size, uint32_t num_qo_heads, uint32_t num_kv_heads,
@@ -350,7 +350,7 @@ cudaError_t DecodePlan(void* float_buffer, size_t float_workspace_size_in_bytes,
 }
 
 template <typename IdType>
-auto PrefillSplitQOKVIndptr(IdType* qo_indptr_h, IdType* kv_indptr_h, uint32_t batch_size,
+inline auto PrefillSplitQOKVIndptr(IdType* qo_indptr_h, IdType* kv_indptr_h, uint32_t batch_size,
                             uint32_t num_qo_heads, uint32_t num_kv_heads, uint32_t head_dim,
                             uint32_t page_size, uint32_t max_batch_size_if_split,
                             bool enable_cuda_graph) {
@@ -520,7 +520,7 @@ struct PrefillPlanInfo {
 };
 
 template <typename IdType>
-cudaError_t PrefillPlan(void* float_buffer, size_t float_workspace_size_in_bytes, void* int_buffer,
+inline cudaError_t PrefillPlan(void* float_buffer, size_t float_workspace_size_in_bytes, void* int_buffer,
                         void* page_locked_int_buffer, size_t int_workspace_size_in_bytes,
                         PrefillPlanInfo& plan_info, IdType* qo_indptr_h, IdType* kv_indptr_h,
                         uint32_t batch_size, uint32_t num_qo_heads, uint32_t num_kv_heads,

diff --git a/python/3rdparty b/python/3rdparty
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
diff --git a/python/_aot_build_utils/__init__.py b/python/_aot_build_utils/__init__.py
diff --git a/...r-aot/generate_batch_paged_decode_inst.py → ...utils/generate_batch_paged_decode_inst.py b/...r-aot/generate_batch_paged_decode_inst.py → ...utils/generate_batch_paged_decode_inst.py
@@ -14,14 +14,15 @@
 limitations under the License.
 """
 
-import sys
 import re
-from literal_map import (
-    pos_encoding_mode_literal,
+import sys
+from pathlib import Path
+
+from .literal_map import (
     dtype_literal,
     idtype_literal,
+    pos_encoding_mode_literal,
 )
-from pathlib import Path
 
 
 def get_cu_file_str(

diff --git a/...-aot/generate_batch_paged_prefill_inst.py → ...tils/generate_batch_paged_prefill_inst.py b/...-aot/generate_batch_paged_prefill_inst.py → ...tils/generate_batch_paged_prefill_inst.py
@@ -14,16 +14,16 @@
 limitations under the License.
 """
 
-import sys
 import re
-import itertools
-from literal_map import (
-    mask_mode_literal,
-    pos_encoding_mode_literal,
+import sys
+from pathlib import Path
+
+from .literal_map import (
     dtype_literal,
     idtype_literal,
+    mask_mode_literal,
+    pos_encoding_mode_literal,
 )
-from pathlib import Path
 
 
 def get_cu_file_str(

diff --git a/...aot/generate_batch_ragged_prefill_inst.py → ...ils/generate_batch_ragged_prefill_inst.py b/...aot/generate_batch_ragged_prefill_inst.py → ...ils/generate_batch_ragged_prefill_inst.py
@@ -14,15 +14,16 @@
 limitations under the License.
 """
 
-import sys
 import re
-from literal_map import (
-    mask_mode_literal,
-    pos_encoding_mode_literal,
+import sys
+from pathlib import Path
+
+from .literal_map import (
     dtype_literal,
     idtype_literal,
+    mask_mode_literal,
+    pos_encoding_mode_literal,
 )
-from pathlib import Path
 
 
 def get_cu_file_str(

diff --git a/flashinfer-aot/generate_dispatch_inc.py → ..._aot_build_utils/generate_dispatch_inc.py b/flashinfer-aot/generate_dispatch_inc.py → ..._aot_build_utils/generate_dispatch_inc.py
@@ -16,10 +16,11 @@
 
 import argparse
 from pathlib import Path
-from literal_map import (
-    pos_encoding_mode_literal,
+
+from .literal_map import (
     bool_literal,
     mask_mode_literal,
+    pos_encoding_mode_literal,
 )
 
 

diff --git a/...hinfer-aot/generate_single_decode_inst.py → ...uild_utils/generate_single_decode_inst.py b/...hinfer-aot/generate_single_decode_inst.py → ...uild_utils/generate_single_decode_inst.py
@@ -14,13 +14,14 @@
 limitations under the License.
 """
 
-import sys
 import re
-from literal_map import (
-    pos_encoding_mode_literal,
+import sys
+from pathlib import Path
+
+from .literal_map import (
     dtype_literal,
+    pos_encoding_mode_literal,
 )
-from pathlib import Path
 
 
 def get_cu_file_str(

diff --git a/...infer-aot/generate_single_prefill_inst.py → ...ild_utils/generate_single_prefill_inst.py b/...infer-aot/generate_single_prefill_inst.py → ...ild_utils/generate_single_prefill_inst.py
@@ -14,14 +14,15 @@
 limitations under the License.
 """
 
-import sys
 import re
-from literal_map import (
-    pos_encoding_mode_literal,
+import sys
+from pathlib import Path
+
+from .literal_map import (
     dtype_literal,
     mask_mode_literal,
+    pos_encoding_mode_literal,
 )
-from pathlib import Path
 
 
 def get_cu_file_str(

diff --git a/flashinfer-aot/literal_map.py → python/_aot_build_utils/literal_map.py b/flashinfer-aot/literal_map.py → python/_aot_build_utils/literal_map.py
diff --git a/python/aot_MANIFEST.in b/python/aot_MANIFEST.in
@@ -0,0 +1,13 @@
+# MANIFEST.in for AOT wheel
+
+prune */__pycache__
+prune csrc
+prune csrc_aot
+exclude aot_setup.py
+exclude setup.py
+
+include flashinfer/data/version.txt
+graft flashinfer/data/csrc
+graft flashinfer/data/include
+graft flashinfer/data/cutlass/include
+graft flashinfer/data/cutlass/tools/util/include