mirror of
https://github.com/mozilla/DeepSpeech.git
synced 2025-10-26 11:19:39 +00:00
85 lines
2.8 KiB
C
85 lines
2.8 KiB
C
|
|
#ifndef __DEEPSPEECH_H__
#define __DEEPSPEECH_H__

#include <stddef.h> /* size_t, used by DsGetMfccFrames() and DsSTT() */
|
|
|
|
/**
 * @brief Handle type for a DeepSpeech context.
 *
 * Only the struct tag is declared in this header; the type is used here
 * exclusively through pointers (see DsInit() and DsClose()).
 */
typedef struct _DeepSpeechContext DeepSpeechContext;
|
|
|
|
/**
|
|
* @brief Initialise a DeepSpeech context.
|
|
*
|
|
* @param aModelPath The path to the frozen model graph.
|
|
* @param aNCep The number of cepstrum the model was trained with.
|
|
* @param aNContext The context window the model was trained with.
|
|
*
|
|
* @return A DeepSpeech context.
|
|
*/
|
|
DeepSpeechContext* DsInit(const char* aModelPath, int aNCep, int aNContext);
|
|
|
|
/**
|
|
* @brief De-initialise a DeepSpeech context.
|
|
*
|
|
* @param aCtx A DeepSpeech context.
|
|
*/
|
|
void DsClose(DeepSpeechContext* aCtx);
|
|
|
|
/**
|
|
* @brief Extract MFCC features from a given audio signal and add context.
|
|
*
|
|
* Extracts MFCC features from a given audio signal and adds the appropriate
|
|
* amount of context to run inference with the given DeepSpeech context.
|
|
*
|
|
* @param aCtx A DeepSpeech context.
|
|
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
|
|
* rate.
|
|
* @param aBufferSize The sample-length of the audio signal.
|
|
* @param aSampleRate The sample-rate of the audio signal.
|
|
* @param[out] aMFCC An array containing features, of shape
|
|
* (frames, ncep * ncontext). The user is responsible for
|
|
* freeing each row in the array, as well as the array itself.
|
|
*
|
|
* @return The number of frames in @p aMFCC.
|
|
*/
|
|
int DsGetMfccFrames(DeepSpeechContext* aCtx, const short* aBuffer,
|
|
size_t aBufferSize, int aSampleRate, float*** aMfcc);
|
|
|
|
/**
 * @brief Free an array of MFCC features.
 *
 * Frees an array of MFCC features returned by DsGetMfccFrames().
 *
 * @param aMfcc An MFCC features array.
 * @param aNFrames The number of frames in @p aMfcc, i.e. the value returned
 *                 by the DsGetMfccFrames() call that produced the array.
 */
void DsFreeMfccFrames(float** aMfcc, int aNFrames);
|
|
|
|
/**
|
|
* @brief Run inference on the given audio.
|
|
*
|
|
* Runs inference on the given MFCC audio features with the given DeepSpeech
|
|
* context. See DsGetMfccFrames().
|
|
*
|
|
* @param aCtx A DeepSpeech context.
|
|
* @param aMfcc MFCC features with the appropriate amount of context per frame.
|
|
* @param aNFrames The number of frames in @p aMfcc.
|
|
*
|
|
* @return The resulting string after running inference. The user is
|
|
* responsible for freeing this string.
|
|
*/
|
|
char* DsInfer(DeepSpeechContext* aCtx, float** aMfcc, int aNFrames);
|
|
|
|
/**
|
|
* @brief Use DeepSpeech to perform Speech-To-Text.
|
|
*
|
|
* @param aMfcc An MFCC features array.
|
|
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
|
|
* rate.
|
|
* @param aBufferSize The number of samples in the audio signal.
|
|
* @param aSampleRate The sample-rate of the audio signal.
|
|
*
|
|
* @return The STT result. The user is responsible for freeing this string.
|
|
*/
|
|
char* DsSTT(DeepSpeechContext* aCtx, const short* aBuffer, size_t aBufferSize,
|
|
int aSampleRate);
|
|
|
|
#endif /* __DEEPSPEECH_H__ */
|