-
Notifications
You must be signed in to change notification settings - Fork 205
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add a choice of how to end streaming from callback: STOP or CANCEL #1476
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,8 +15,8 @@ enum class GenerationStatus { | |
RUNNING = 0, // Default status for ongoing generation | ||
FINISHED = 1, // Status set when generation has been finished | ||
IGNORED = 2, // Status set when generation ran into an out-of-memory condition and could not be continued | ||
DROPPED_BY_PIPELINE = 3, // Currently not used, TODO: implement abort functionality | ||
DROPPED_BY_HANDLE = 4 // Status set when generation handle is dropped | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. let's deprecate |
||
CANCEL = 3, // Status set when generation handle is canceled | ||
STOP = 4 // Status set when generation handle is stopped | ||
}; | ||
|
||
struct EncodedGenerationResult { | ||
|
@@ -70,10 +70,10 @@ using GenerationOutputs = std::unordered_map<uint64_t, GenerationOutput>; | |
|
||
class GenerationStream; | ||
|
||
class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { | ||
class OPENVINO_GENAI_EXPORTS | ||
GenerationHandleImpl { | ||
std::shared_ptr<GenerationStream> m_generation_stream; | ||
ov::genai::GenerationConfig m_sampling_params; | ||
|
||
ov::genai::GenerationConfig m_sampling_params; | ||
public: | ||
GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) : | ||
m_generation_stream(std::move(generation_stream)), | ||
|
@@ -88,10 +88,18 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { | |
GenerationStatus get_status(); | ||
|
||
bool can_read(); | ||
bool is_dropped(); | ||
|
||
bool is_stopped(); | ||
|
||
bool is_canceled(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
OPENVINO_DEPRECATED("Please, use `stop()` instead of `drop()`.") | ||
void drop(); | ||
|
||
void stop(); | ||
|
||
void cancel(); | ||
|
||
GenerationOutputs back(); | ||
// Reads result of a generation for single iteration | ||
GenerationOutputs read(); | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -4,16 +4,29 @@ | |||||
#pragma once | ||||||
|
||||||
#include "openvino/genai/tokenizer.hpp" | ||||||
#include "openvino/genai/generation_handle.hpp" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. looks like this header file is not required here anymore |
||||||
#include <variant> | ||||||
|
||||||
namespace ov { | ||||||
namespace genai { | ||||||
|
||||||
enum class StreamerRunningStatus { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
RUNNING = 0, // Continue running inference | ||||||
STOP = 1, // Stop generation, keep history as is, KV cache includes last request and generated tokens | ||||||
CANCEL = 2 // Stop generation, drop the last prompt and all generated tokens from history; KV cache includes history except the last step | ||||||
}; | ||||||
|
||||||
using CallbackTypeVariant = std::variant<bool, StreamerRunningStatus, std::monostate>; | ||||||
|
||||||
/** | ||||||
* @brief base class for streamers. In order to use it, inherit from this class and implement the put and end methods | ||||||
* | ||||||
* @param m_tokenizer tokenizer | ||||||
*/ | ||||||
class OPENVINO_GENAI_EXPORTS StreamerBase { | ||||||
protected: | ||||||
StreamerRunningStatus m_streaming_finish_status = StreamerRunningStatus::RUNNING; | ||||||
|
||||||
public: | ||||||
/// @brief put is called every time new token is decoded, | ||||||
/// @return bool flag to indicate whether generation should be stopped; if it returns true, generation stops | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. maybe we can add a new function CC @Wovchena @sbalandi @as-suvorov what do you think? BTW, if you are OK with a new method, note that we need to select a more or less generic name, which will allow putting a single token or multiple tokens (Whisper / Spec Dec cases) |
||||||
|
@@ -22,6 +35,12 @@ class OPENVINO_GENAI_EXPORTS StreamerBase { | |||||
/// @brief end is called at the end of generation. It can be used to flush cache if your own streamer has one | ||||||
virtual void end() = 0; | ||||||
|
||||||
/// @brief get_streaming_status() is called by the pipeline to get more detailed information about the streaming status. m_streaming_finish_status, which contains the streaming status info, can be set in put(). | ||||||
/// @return ov::genai::StreamerRunningStatus to determine the streaming status of generation, whether generation is running, stopped or cancelled | ||||||
virtual StreamerRunningStatus get_streaming_status() { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
return m_streaming_finish_status; | ||||||
} | ||||||
|
||||||
virtual ~StreamerBase(); | ||||||
}; | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -9,32 +9,44 @@ | |||||
using namespace ov::genai; | ||||||
|
||||||
GenerationHandleImpl::~GenerationHandleImpl() { | ||||||
drop(); | ||||||
stop(); | ||||||
} | ||||||
|
||||||
GenerationStatus GenerationHandleImpl::get_status() { | ||||||
return m_generation_stream->get_status(); | ||||||
} | ||||||
|
||||||
bool GenerationHandleImpl::can_read() { | ||||||
return !is_dropped() && m_generation_stream->can_read(); | ||||||
return !is_canceled() && !is_stopped() && m_generation_stream->can_read(); | ||||||
} | ||||||
|
||||||
bool GenerationHandleImpl::is_dropped() { | ||||||
return get_status() == GenerationStatus::DROPPED_BY_HANDLE; | ||||||
bool GenerationHandleImpl::is_stopped() { | ||||||
return get_status() == GenerationStatus::STOP; | ||||||
} | ||||||
|
||||||
bool GenerationHandleImpl::is_canceled() { | ||||||
return get_status() == GenerationStatus::CANCEL; | ||||||
} | ||||||
|
||||||
void GenerationHandleImpl::drop() { | ||||||
m_generation_stream->drop(); | ||||||
m_generation_stream->stop(); | ||||||
} | ||||||
|
||||||
void GenerationHandleImpl::stop() { | ||||||
m_generation_stream->stop(); | ||||||
} | ||||||
|
||||||
void GenerationHandleImpl::cancel() { | ||||||
m_generation_stream->cancel(); | ||||||
} | ||||||
|
||||||
std::unordered_map<uint64_t, GenerationOutput> GenerationHandleImpl::back() { | ||||||
OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped."); | ||||||
OPENVINO_ASSERT(!is_stopped(), "GenerationHandle cannot be used after it is stopped."); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
the same in other places. |
||||||
return m_generation_stream->back(); | ||||||
} | ||||||
|
||||||
std::unordered_map<uint64_t, GenerationOutput> GenerationHandleImpl::read() { | ||||||
OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped."); | ||||||
OPENVINO_ASSERT(!is_stopped(), "GenerationHandle cannot be used after it is stopped."); | ||||||
return m_generation_stream->read(); | ||||||
} | ||||||
|
||||||
|
@@ -57,7 +69,7 @@ void add_partial_result(std::unordered_map<uint64_t, GenerationOutput>& partial_ | |||||
} | ||||||
|
||||||
std::vector<GenerationOutput> GenerationHandleImpl::read_all() { | ||||||
OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped."); | ||||||
OPENVINO_ASSERT(!is_stopped(), "GenerationHandle cannot be used after it is stopped."); | ||||||
std::vector<GenerationOutput> results; | ||||||
std::unordered_map<uint64_t, GenerationOutput> partial_results; | ||||||
// We iterate while generation is running or there are tokens we haven't read yet | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -89,7 +89,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( | |
const auto decode_start = std::chrono::steady_clock::now(); | ||
generated.push_back(m_tokenizer.decode(res.m_generation_ids.at(idx))); | ||
raw_counters.detokenization_durations.emplace_back(std::chrono::steady_clock::now() - decode_start); | ||
if (m_is_chat_conversation && 0 == idx) { | ||
if (m_is_chat_conversation && 0 == idx && res.m_status != ov::genai::GenerationStatus::CANCEL) { | ||
m_history.push_back({{"role", "assistant"}, {"content", generated.back()}}); | ||
} | ||
} | ||
|
@@ -110,6 +110,10 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( | |
}); | ||
} | ||
|
||
// if streaming was canceled, the prompt/answer of the current step shouldn't be present in history, so let's remove the prompt from history | ||
if (m_is_chat_conversation && !encoded.empty() && encoded[0].m_status == ov::genai::GenerationStatus::CANCEL) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. why do we need |
||
m_history.pop_back(); | ||
|
||
return decoded; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.