forked from mozilla/DeepSpeech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeepspeech.h
277 lines (251 loc) · 10.1 KB
/
deepspeech.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
#ifndef DEEPSPEECH_H
#define DEEPSPEECH_H
/*
 * DEEPSPEECH_EXPORT is the decoration placed in front of the public
 * declarations in this header. It expands to:
 *   - nothing, when SWIG is defined;
 *   - extern "C" __declspec(dllexport), when _MSC_VER is defined;
 *   - __attribute__ ((visibility("default"))) otherwise.
 *
 * NOTE(review): only the _MSC_VER branch adds extern "C" -- confirm that the
 * linkage difference between the two compiler branches is intended.
 */
#ifndef SWIG
#if defined _MSC_VER
#define DEEPSPEECH_EXPORT extern "C" __declspec(dllexport)
#else /* !_MSC_VER */
#define DEEPSPEECH_EXPORT __attribute__ ((visibility("default")))
#endif /* _MSC_VER */
#else /* SWIG */
#define DEEPSPEECH_EXPORT
#endif /* !SWIG */
// Opaque handle types: they are only forward-declared in this header, so
// code that includes it can hold and pass pointers to them but cannot
// access their members.
struct ModelState;
struct StreamingState;
/**
 * @brief One individual character together with its timing information.
 */
struct MetadataItem {
  /** The character this item describes. */
  char* character;

  /** Position of the character in units of 20ms. */
  int timestep;

  /** Position of the character in seconds. */
  float start_time;
};
/**
 * @brief The entire CTC output, stored as an array of character metadata
 *        objects.
 */
struct Metadata {
  /** Array of per-character metadata objects. */
  MetadataItem* items;

  /** NOTE(review): presumably the number of entries in @c items -- confirm. */
  int num_items;
};
/**
 * @brief Error codes, grouped by their most significant hex digit:
 *        0x0000 is OK, 0x1xxx is missing information, 0x2xxx is invalid
 *        parameters and 0x3xxx is runtime failures.
 */
enum DeepSpeech_Error_Codes {
  // OK
  DS_ERR_OK                 = 0x0000,

  // Missing information
  DS_ERR_NO_MODEL           = 0x1000,

  // Invalid parameters
  DS_ERR_INVALID_ALPHABET   = 0x2000,
  DS_ERR_INVALID_SHAPE      = 0x2001,
  DS_ERR_INVALID_LM         = 0x2002,

  // Runtime failures
  DS_ERR_FAIL_INIT_MMAP     = 0x3000,
  DS_ERR_FAIL_INIT_SESS     = 0x3001,
  DS_ERR_FAIL_INTERPRETER   = 0x3002,
  DS_ERR_FAIL_RUN_SESS      = 0x3003,
  DS_ERR_FAIL_CREATE_STREAM = 0x3004,
  DS_ERR_FAIL_READ_PROTOBUF = 0x3005,
  DS_ERR_FAIL_CREATE_SESS   = 0x3006
};
/**
 * @brief Create a ModelState that provides an interface to a trained
 *        DeepSpeech model.
 *
 * @param aModelPath          Path to the frozen model graph.
 * @param aNCep               Number of cepstrum the model was trained with.
 * @param aNContext           Context window the model was trained with.
 * @param aAlphabetConfigPath Path to the configuration file specifying the
 *                            alphabet used by the network. See alphabet.h.
 * @param aBeamWidth          Beam width used by the decoder. A larger beam
 *                            width generates better results at the cost of
 *                            decoding time.
 * @param[out] retval         Receives a ModelState pointer.
 *
 * @return Zero on success, non-zero on failure.
 */
DEEPSPEECH_EXPORT
int DS_CreateModel(
    const char* aModelPath,
    unsigned int aNCep,
    unsigned int aNContext,
    const char* aAlphabetConfigPath,
    unsigned int aBeamWidth,
    ModelState** retval);
/**
 * @brief Destroy a model object and free the resources associated with it.
 *
 * @param ctx The ModelState pointer of the model to destroy.
 */
DEEPSPEECH_EXPORT
void DS_DestroyModel(
    ModelState* ctx);
/**
 * @brief Turn on decoding that uses beam scoring with a KenLM language model.
 *
 * @param aCtx                The ModelState pointer for the model being
 *                            changed.
 * @param aAlphabetConfigPath Path to the configuration file specifying the
 *                            alphabet used by the network. See alphabet.h.
 * @param aLMPath             Path to the language model binary file.
 * @param aTriePath           Path to the trie file built from the same
 *                            vocabulary as the language model binary.
 * @param aLMAlpha            The alpha hyperparameter of the CTC decoder:
 *                            language model weight.
 * @param aLMBeta             The beta hyperparameter of the CTC decoder:
 *                            word insertion weight.
 *
 * @return Zero on success, non-zero on failure (invalid arguments).
 */
DEEPSPEECH_EXPORT
int DS_EnableDecoderWithLM(
    ModelState* aCtx,
    const char* aAlphabetConfigPath,
    const char* aLMPath,
    const char* aTriePath,
    float aLMAlpha,
    float aLMBeta);
/**
 * @brief Perform Speech-To-Text on an audio buffer using the DeepSpeech model.
 *
 * @param aCtx        The ModelState pointer for the model to use.
 * @param aBuffer     A 16-bit, mono raw audio signal at the appropriate
 *                    sample rate.
 * @param aBufferSize Number of samples in the audio signal.
 * @param aSampleRate Sample rate of the audio signal.
 *
 * @return The STT result, or NULL on error. The user is responsible for
 *         freeing the returned string.
 */
DEEPSPEECH_EXPORT
char* DS_SpeechToText(
    ModelState* aCtx,
    const short* aBuffer,
    unsigned int aBufferSize,
    unsigned int aSampleRate);
/**
 * @brief Perform Speech-To-Text using the DeepSpeech model and output
 *        metadata about the results.
 *
 * @param aCtx        The ModelState pointer for the model to use.
 * @param aBuffer     A 16-bit, mono raw audio signal at the appropriate
 *                    sample rate.
 * @param aBufferSize Number of samples in the audio signal.
 * @param aSampleRate Sample rate of the audio signal.
 *
 * @return A struct of individual letters along with their timing
 *         information, or NULL on error. The user is responsible for freeing
 *         the Metadata by calling {@link DS_FreeMetadata()}.
 */
DEEPSPEECH_EXPORT
Metadata* DS_SpeechToTextWithMetadata(
    ModelState* aCtx,
    const short* aBuffer,
    unsigned int aBufferSize,
    unsigned int aSampleRate);
/**
 * @brief Create a new streaming inference state.
 *
 * The streaming state produced by this function can then be passed to
 * {@link DS_FeedAudioContent()} and {@link DS_FinishStream()}.
 *
 * @param aCtx            The ModelState pointer for the model to use.
 * @param aPreAllocFrames Number of timestep frames to reserve. One timestep
 *                        is equivalent to two window lengths (20ms). If set
 *                        to 0, enough frames for 3 seconds of audio (150)
 *                        are reserved.
 * @param aSampleRate     Sample rate of the audio signal.
 * @param[out] retval     Receives an opaque pointer that represents the
 *                        streaming state. Can be NULL if an error occurs.
 *
 * @return Zero for success, non-zero on failure.
 */
DEEPSPEECH_EXPORT
int DS_SetupStream(
    ModelState* aCtx,
    unsigned int aPreAllocFrames,
    unsigned int aSampleRate,
    StreamingState** retval);
/**
 * @brief Feed audio samples to an ongoing streaming inference.
 *
 * @param aSctx       A streaming state pointer returned by
 *                    {@link DS_SetupStream()}.
 * @param aBuffer     An array of 16-bit, mono raw audio samples at the
 *                    appropriate sample rate.
 * @param aBufferSize Number of samples in @p aBuffer.
 */
DEEPSPEECH_EXPORT
void DS_FeedAudioContent(
    StreamingState* aSctx,
    const short* aBuffer,
    unsigned int aBufferSize);
/**
 * @brief Compute the intermediate decoding of an ongoing streaming inference.
 *
 * This is an expensive process: the decoder implementation isn't currently
 * capable of streaming, so it always starts from the beginning of the audio.
 *
 * @param aSctx A streaming state pointer returned by
 *              {@link DS_SetupStream()}.
 *
 * @return The STT intermediate result. The user is responsible for freeing
 *         the string.
 */
DEEPSPEECH_EXPORT
char* DS_IntermediateDecode(
    StreamingState* aSctx);
/**
 * @brief Signal the end of an audio signal to an ongoing streaming inference
 *        and return the STT result over the whole audio signal.
 *
 * @param aSctx A streaming state pointer returned by
 *              {@link DS_SetupStream()}.
 *
 * @return The STT result. The user is responsible for freeing the string.
 *
 * @note This method will free the state pointer (@p aSctx).
 */
DEEPSPEECH_EXPORT
char* DS_FinishStream(
    StreamingState* aSctx);
/**
 * @brief Signal the end of an audio signal to an ongoing streaming inference
 *        and return per-letter metadata.
 *
 * @param aSctx A streaming state pointer returned by
 *              {@link DS_SetupStream()}.
 *
 * @return A struct of individual letters along with their timing
 *         information, or NULL on error. The user is responsible for freeing
 *         the Metadata by calling {@link DS_FreeMetadata()}.
 *
 * @note This method will free the state pointer (@p aSctx).
 */
DEEPSPEECH_EXPORT
Metadata* DS_FinishStreamWithMetadata(
    StreamingState* aSctx);
/**
 * @brief Destroy a streaming state without decoding the computed logits.
 *
 * Use this when the result of an ongoing streaming inference is no longer
 * needed and a costly decode operation should be avoided.
 *
 * @param aSctx A streaming state pointer returned by
 *              {@link DS_SetupStream()}.
 *
 * @note This method will free the state pointer (@p aSctx).
 */
DEEPSPEECH_EXPORT
void DS_DiscardStream(
    StreamingState* aSctx);
/**
 * @brief Given audio, return a vector suitable for input to a DeepSpeech
 *        model trained with the given parameters.
 *
 * Extracts MFCC features from the given audio signal and adds the
 * appropriate amount of context to run inference on a DeepSpeech model
 * trained with the given parameters.
 *
 * @param aBuffer        A 16-bit, mono raw audio signal at the appropriate
 *                       sample rate.
 * @param aBufferSize    The sample-length of the audio signal.
 * @param aSampleRate    The sample-rate of the audio signal.
 * @param aNCep          The number of cepstrum.
 * @param aNContext      The size of the context window.
 * @param[out] aMfcc     An array containing features, of shape
 *                       (@p aNFrames, ncep * ncontext). The user is
 *                       responsible for freeing the array.
 * @param[out] aNFrames  (optional) The number of frames in @p aMfcc.
 * @param[out] aFrameLen (optional) The length of each frame
 *                       (ncep * ncontext) in @p aMfcc.
 *
 * @note The two optional parameters default to NULL, and no include that
 *       defines NULL is visible in this header.
 *       NOTE(review): confirm including code always provides it.
 */
DEEPSPEECH_EXPORT
void DS_AudioToInputVector(
    const short* aBuffer,
    unsigned int aBufferSize,
    unsigned int aSampleRate,
    unsigned int aNCep,
    unsigned int aNContext,
    float** aMfcc,
    int* aNFrames = NULL,
    int* aFrameLen = NULL);
/**
 * @brief Free the memory allocated for metadata information.
 *
 * @param m The Metadata to free, e.g. one returned by
 *          {@link DS_SpeechToTextWithMetadata()} or
 *          {@link DS_FinishStreamWithMetadata()}.
 */
DEEPSPEECH_EXPORT
void DS_FreeMetadata(
    Metadata* m);
/**
 * @brief Print the version of this library and the version of the linked
 *        TensorFlow library.
 */
DEEPSPEECH_EXPORT void DS_PrintVersions();
/* Remove the export helper macro so it is not visible to code after this point. */
#undef DEEPSPEECH_EXPORT
#endif /* DEEPSPEECH_H */