# [Core] Optimize request output tokens putting back implementation to reduce overhead #55
# NOTE(review): the line above is not workflow content (it reads like a
# change title); it is kept as a comment so the file parses as YAML.
---
# CI workflow: runs the offline inference example inside the dev container
# for every pull request targeting `main` and every push to `main`.
name: offline_inference

on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main

jobs:
  # Cancels older runs of this workflow so only the most recent one keeps
  # going (`all_but_latest: true`).
  cancel_previous_workflows:
    runs-on: [self-hosted]
    timeout-minutes: 3
    steps:
      # NOTE(review): the original reference was garbled into
      # "styfle/[email protected]". The action name is restored from its
      # `all_but_latest` input; the version tag could not be recovered --
      # TODO confirm 0.12.1 against the repository history.
      - uses: styfle/cancel-workflow-action@0.12.1
        with:
          all_but_latest: true

  # Starts only after the cancellation job has finished (`needs`).
  offline_inference:
    needs: cancel_previous_workflows
    runs-on: [self-hosted]
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v4
      # Mounts the checked-out workspace at /workspace in the container,
      # installs the package in editable mode (pip stdout is discarded) and
      # then runs the example script.
      # NOTE(review): "examlpes" looks like a misspelling of "examples", but
      # the path is left untouched because the directory may really carry
      # that name -- verify against the repository layout before renaming.
      - name: Run offline inference example
        run: |
          nvidia-docker run --rm -t --net host --ipc host \
            -v ${PWD}:/workspace \
            -w /workspace \
            registry.cn-beijing.aliyuncs.com/llumnix/llumnix-dev:20240909_action_678a439 \
            bash -c "pip install -e . > /dev/null && python examlpes/offline_inference.py"

# NOTE(review): trailing text below is not workflow content; it is kept as a
# comment so the file parses as YAML.
# Provide feedback