/*
 * DeepSpeech/native_client/deepspeech.h
 * (file-listing metadata: 85 lines, 2.8 KiB, C)
 */

#ifndef __DEEPSPEECH_H__
#define __DEEPSPEECH_H__

#include <stddef.h> /* size_t */
/**
 * @brief Handle type for a DeepSpeech context.
 *
 * Only a forward declaration of the underlying structure is given in this
 * header; every function declared here refers to it through a pointer.
 */
typedef struct _DeepSpeechContext DeepSpeechContext;
/**
 * @brief Initialise a DeepSpeech context.
 *
 * @param aModelPath The path to the frozen model graph.
 * @param aNCep The number of cepstral coefficients the model was trained
 *              with.
 * @param aNContext The context window the model was trained with.
 *
 * @return A DeepSpeech context. Pass it to DsClose() to de-initialise it.
 *
 * NOTE(review): the behaviour on failure (e.g. whether NULL is returned when
 * the model cannot be loaded) is not specified in this header -- confirm
 * against the implementation.
 */
DeepSpeechContext* DsInit(const char* aModelPath, int aNCep, int aNContext);
/**
 * @brief De-initialise a DeepSpeech context.
 *
 * @param aCtx A DeepSpeech context, as returned by DsInit().
 *
 * NOTE(review): whether a NULL @p aCtx is accepted is not specified in this
 * header -- confirm against the implementation.
 */
void DsClose(DeepSpeechContext* aCtx);
/**
 * @brief Extract MFCC features from a given audio signal and add context.
 *
 * Extracts MFCC features from a given audio signal and adds the appropriate
 * amount of context to run inference with the given DeepSpeech context.
 *
 * @param aCtx A DeepSpeech context.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
 *                rate.
 * @param aBufferSize The length of the audio signal, in samples.
 * @param aSampleRate The sample-rate of the audio signal.
 * @param[out] aMfcc An array containing features, of shape
 *                   (frames, ncep * ncontext). The user is responsible for
 *                   freeing each row in the array, as well as the array
 *                   itself; DsFreeMfccFrames() frees an array returned by
 *                   this function.
 *
 * @return The number of frames in @p aMfcc.
 */
int DsGetMfccFrames(DeepSpeechContext* aCtx, const short* aBuffer,
size_t aBufferSize, int aSampleRate, float*** aMfcc);
/**
 * @brief Free an array of MFCC features.
 *
 * Frees an array of MFCC features returned by DsGetMfccFrames().
 *
 * @param aMfcc An MFCC features array, as written to the output parameter of
 *              DsGetMfccFrames().
 * @param aNFrames The number of frames in @p aMfcc, i.e. the value returned
 *                 by the DsGetMfccFrames() call that produced it.
 */
void DsFreeMfccFrames(float** aMfcc, int aNFrames);
/**
 * @brief Run inference on the given audio.
 *
 * Runs inference on the given MFCC audio features with the given DeepSpeech
 * context. See DsGetMfccFrames() for how to obtain suitable features.
 *
 * @param aCtx A DeepSpeech context.
 * @param aMfcc MFCC features with the appropriate amount of context per
 *              frame.
 * @param aNFrames The number of frames in @p aMfcc.
 *
 * @return The resulting string after running inference. The user is
 *         responsible for freeing this string.
 *
 * NOTE(review): the matching deallocator for the returned string (presumably
 * free()) is not stated in this header -- confirm against the implementation.
 */
char* DsInfer(DeepSpeechContext* aCtx, float** aMfcc, int aNFrames);
/**
 * @brief Use DeepSpeech to perform Speech-To-Text.
 *
 * Takes raw audio directly rather than pre-computed MFCC features.
 * NOTE(review): presumably this combines the feature extraction of
 * DsGetMfccFrames() with DsInfer() -- confirm against the implementation.
 *
 * @param aCtx A DeepSpeech context.
 * @param aBuffer A 16-bit, mono raw audio signal at the appropriate sample
 *                rate.
 * @param aBufferSize The number of samples in the audio signal.
 * @param aSampleRate The sample-rate of the audio signal.
 *
 * @return The STT result. The user is responsible for freeing this string.
 */
char* DsSTT(DeepSpeechContext* aCtx, const short* aBuffer, size_t aBufferSize,
int aSampleRate);
#endif /* __DEEPSPEECH_H__ */