Refactor turbomind attention by precomputing rotary embed #1111

	name: linux-x64-gpu
	on:
	push:
	paths:
	- '.github/workflows/linux-x64-gpu.yml'
	- 'src/**'
	- 'CMakeLists.txt'
	- 'cmake/**'
	- 'examples/**'
	- '3rdparty/**'
	- 'tests/csrc/**'
	pull_request:
	paths:
	- '.github/workflows/linux-x64-gpu.yml'
	- 'src/**'
	- 'CMakeLists.txt'
	- 'cmake/**'
	- 'examples/**'
	- '3rdparty/**'
	- 'tests/csrc/**'
	concurrency:
	group: linux-x64-gpu-${{ github.ref }}
	cancel-in-progress: true
	permissions:
	contents: read

	jobs:
	build:
	strategy:
	matrix:
	cudaver: [11.8, 12.1]
	name: cuda-${{ matrix.cudaver }}
	runs-on: ubuntu-latest
	steps:
	- name: Free disk space
	uses: jlumbroso/free-disk-space@main
	with:
	# This might remove tools that are actually needed, if set to "true" but frees about 6 GB
	tool-cache: false
	docker-images: false
	# All of these default to true, but feel free to set to "false" if necessary for your workflow
	android: true
	dotnet: true
	haskell: true
	large-packages: true
	swap-storage: false
	- name: Checkout repository
	uses: actions/checkout@v3
	- name: Build
	uses: addnab/docker-run-action@v3
	with:
	image: openmmlab/lmdeploy-builder:cuda${{ matrix.cudaver }}
	options: -v ${{ github.workspace }}:/work
	run: \|
	cd /work
	source /opt/conda/bin/activate
	conda activate py38
	mkdir build && cd build
	bash ../generate.sh make
	make -j$(nproc) && make install

Provide feedback