# DeepSpeech/native_client/python/__init__.py

import os
import platform

# The API is not snake_case, which triggers linter errors
#pylint: disable=invalid-name

if platform.system().lower() == "windows":
    # 'lib' directory located next to this file; it is registered below as a
    # search location for the dynamic linker.
    dslib_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'lib')

    # On Windows, we can't rely on RPATH being set to $ORIGIN/lib/ or on
    # @loader_path/lib
    if hasattr(os, 'add_dll_directory'):
        # Starting with Python 3.8 this properly handles the problem
        os.add_dll_directory(dslib_path)
    else:
        # Before Python 3.8 we need to change the PATH to include the proper
        # directory for the dynamic linker. PATH may be unset or empty, so the
        # previous value is only appended when there is one (filter(None, ...)
        # drops both None and the empty string).
        os.environ['PATH'] = os.pathsep.join(
            filter(None, (dslib_path, os.environ.get('PATH'))))

import deepspeech

# rename for backwards compatibility
from deepspeech.impl import Version as version

class Model(object):
    """
    Class holding a DeepSpeech model

    :param model_path: Path to model file to load
    :type model_path: str

    :throws: RuntimeError if the native model cannot be created
    """
    def __init__(self, model_path):
        # make sure the attribute is there if CreateModel fails
        self._impl = None

        status, impl = deepspeech.impl.CreateModel(model_path)
        if status != 0:
            raise RuntimeError("CreateModel failed with error code 0x{:X}".format(status))
        self._impl = impl

    def __del__(self):
        # __del__ also runs on instances whose __init__ never executed (for
        # example when the constructor is called with the wrong number of
        # arguments). In that case the attribute does not exist at all, so it
        # must be looked up defensively.
        impl = getattr(self, '_impl', None)
        if impl:
            deepspeech.impl.FreeModel(impl)
            self._impl = None

    def beamWidth(self):
        """
        Get beam width value used by the model. If setModelBeamWidth was not
        called before, will return the default value loaded from the model file.

        :return: Beam width value used by the model.
        :type: int
        """
        return deepspeech.impl.GetModelBeamWidth(self._impl)

    def setBeamWidth(self, beam_width):
        """
        Set beam width value used by the model.

        :param beam_width: The beam width used by the model. A larger beam width value generates better results at the cost of decoding time.
        :type beam_width: int

        :return: Zero on success, non-zero on failure.
        :type: int
        """
        return deepspeech.impl.SetModelBeamWidth(self._impl, beam_width)

    def sampleRate(self):
        """
        Return the sample rate expected by the model.

        :return: Sample rate.
        :type: int
        """
        return deepspeech.impl.GetModelSampleRate(self._impl)

    def enableExternalScorer(self, scorer_path):
        """
        Enable decoding using an external scorer.

        :param scorer_path: The path to the external scorer file.
        :type scorer_path: str

        :return: Zero on success, non-zero on failure.
        :type: int
        """
        return deepspeech.impl.EnableExternalScorer(self._impl, scorer_path)

    def disableExternalScorer(self):
        """
        Disable decoding using an external scorer.

        :return: Zero on success, non-zero on failure.
        :type: int
        """
        return deepspeech.impl.DisableExternalScorer(self._impl)

    def setScorerAlphaBeta(self, alpha, beta):
        """
        Set hyperparameters alpha and beta of the external scorer.

        :param alpha: The alpha hyperparameter of the decoder. Language model weight.
        :type alpha: float

        :param beta: The beta hyperparameter of the decoder. Word insertion weight.
        :type beta: float

        :return: Zero on success, non-zero on failure.
        :type: int
        """
        return deepspeech.impl.SetScorerAlphaBeta(self._impl, alpha, beta)

    def stt(self, audio_buffer):
        """
        Use the DeepSpeech model to perform Speech-To-Text.

        :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
        :type audio_buffer: numpy.int16 array

        :return: The STT result.
        :type: str
        """
        return deepspeech.impl.SpeechToText(self._impl, audio_buffer)

    def sttWithMetadata(self, audio_buffer):
        """
        Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.

        :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
        :type audio_buffer: numpy.int16 array

        :return: Outputs a struct of individual letters along with their timing information.
        :type: :func:`Metadata`
        """
        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, audio_buffer)

    def createStream(self):
        """
        Create a new streaming inference state. The streaming state returned by
        this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.

        :return: Stream object representing the newly created stream
        :type: :func:`Stream`

        :throws: RuntimeError on error
        """
        status, ctx = deepspeech.impl.CreateStream(self._impl)
        if status != 0:
            raise RuntimeError("CreateStream failed with error code 0x{:X}".format(status))
        return Stream(ctx)


class Stream(object):
    """
    Class wrapping a DeepSpeech stream. The constructor cannot be called directly.
    Use :func:`Model.createStream()`

    Once the stream has been finished or freed, the wrapped native handle is
    dropped and every further call raises RuntimeError.
    """
    def __init__(self, native_stream):
        self._impl = native_stream

    def __del__(self):
        # __del__ also runs on instances whose __init__ never executed (for
        # example when the constructor is called without its argument), so
        # the attribute may be missing entirely.
        if getattr(self, '_impl', None):
            self.freeStream()

    def feedAudioContent(self, audio_buffer):
        """
        Feed audio samples to an ongoing streaming inference.

        :param audio_buffer: A 16-bit, mono raw audio signal at the appropriate sample rate (matching what the model was trained on).
        :type audio_buffer: numpy.int16 array

        :throws: RuntimeError if the stream object is not valid
        """
        if not self._impl:
            raise RuntimeError("Stream object is not valid. Trying to feed an already finished stream?")
        deepspeech.impl.FeedAudioContent(self._impl, audio_buffer)

    def intermediateDecode(self):
        """
        Compute the intermediate decoding of an ongoing streaming inference.

        :return: The STT intermediate result.
        :type: str

        :throws: RuntimeError if the stream object is not valid
        """
        if not self._impl:
            raise RuntimeError("Stream object is not valid. Trying to decode an already finished stream?")
        return deepspeech.impl.IntermediateDecode(self._impl)

    def finishStream(self):
        """
        Signal the end of an audio signal to an ongoing streaming inference,
        returns the STT result over the whole audio signal.

        :return: The STT result.
        :type: str

        :throws: RuntimeError if the stream object is not valid
        """
        if not self._impl:
            raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
        result = deepspeech.impl.FinishStream(self._impl)
        # The handle must not be used again; dropping it also keeps __del__
        # from freeing the stream a second time.
        self._impl = None
        return result

    def finishStreamWithMetadata(self):
        """
        Signal the end of an audio signal to an ongoing streaming inference,
        returns per-letter metadata.

        :return: Outputs a struct of individual letters along with their timing information.
        :type: :func:`Metadata`

        :throws: RuntimeError if the stream object is not valid
        """
        if not self._impl:
            raise RuntimeError("Stream object is not valid. Trying to finish an already finished stream?")
        result = deepspeech.impl.FinishStreamWithMetadata(self._impl)
        # The handle must not be used again; dropping it also keeps __del__
        # from freeing the stream a second time.
        self._impl = None
        return result

    def freeStream(self):
        """
        Destroy a streaming state without decoding the computed logits. This can
        be used if you no longer need the result of an ongoing streaming inference.

        :throws: RuntimeError if the stream object is not valid
        """
        if not self._impl:
            raise RuntimeError("Stream object is not valid. Trying to free an already finished stream?")
        deepspeech.impl.FreeStream(self._impl)
        self._impl = None


# The classes below exist for documentation purposes only.
# Metadata and MetadataItem should be kept in sync with native_client/deepspeech.h
class MetadataItem(object):
    """
    Holds one individual character together with its timing information.

    The methods below carry documentation only and have no implementation
    in this file.
    """

    def character(self):
        """
        The character generated for transcription
        """

    def timestep(self):
        """
        Position of the character, expressed in units of 20ms
        """

    def start_time(self):
        """
        Position of the character, expressed in seconds
        """


class Metadata(object):
    """
    Holds the entire CTC output as an array of character metadata objects.

    The methods below carry documentation only and have no implementation
    in this file.
    """

    def items(self):
        """
        List of items

        :return: A list of :func:`MetadataItem` elements
        :type: list
        """

    def num_items(self):
        """
        Number of entries in the list of items

        :return: Size of the list of items
        :type: int
        """

    def confidence(self):
        """
        Approximated confidence value for this transcription. This is roughly the
        sum of the acoustic model logit values for each timestep/character that
        contributed to the creation of this transcription.
        """