// DeepSpeech/examples/net_framework/DeepSpeechWPF/MainWindow.xaml.cs

using CSCore;
using CSCore.CoreAudioAPI;
using CSCore.SoundIn;
using CSCore.Streams;
using DeepSpeechClient.Interfaces;
using Microsoft.Win32;
using System;
using System.Collections.Concurrent;
using System.ComponentModel;
using System.Diagnostics;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using System.Windows;

namespace DeepSpeechWPF
{
    /// <summary>
    /// Interaction logic for MainWindow.xaml
    /// </summary>
    public partial class MainWindow : Window
    {
        // Speech-to-text client; created in the constructor and disposed in OnClosing.
        private readonly IDeepSpeech _sttClient;

        // Model parameters passed to CreateModel in Window_Loaded.
        private const uint N_CEP = 26;
        private const uint N_CONTEXT = 9;
        private const uint BEAM_WIDTH = 500;
        // Language-model weights passed to EnableDecoderWithLM in BtnEnableLM_Click.
        private const float LM_ALPHA = 0.75f;
        private const float LM_BETA = 1.85f;


        #region Streaming

        /// <summary>
        /// Audio capture object; assigned once in the constructor.
        /// </summary>
        private readonly WasapiCapture _audioCapture;

        /// <summary>
        /// Active devices found by <see cref="LoadAvailableCaptureDevices"/>; indexed by the
        /// combo box selection when the capture device is changed.
        /// </summary>
        private MMDeviceCollection _audioCaptureDevices;

        /// <summary>
        /// Source wrapping <see cref="_audioCapture"/>; created by <see cref="InitilizeAudioCapture"/>.
        /// </summary>
        private SoundInSource _soundInSource;

        /// <summary>
        /// <see cref="_soundInSource"/> converted to 16 kHz, 16-bit, mono; this is what the
        /// capture callback reads from.
        /// </summary>
        private IWaveSource _convertedSource;

        /// <summary>
        /// Sample buffers produced by the capture callback and consumed by <see cref="OnNewData"/>.
        /// The instance is shared between threads and never replaced, hence readonly.
        /// </summary>
        private readonly ConcurrentQueue<short[]> _bufferQueue = new ConcurrentQueue<short[]>();

        /// <summary>
        /// Backing value for <see cref="IsBusy"/>: 0 means idle, 1 means busy.
        /// </summary>
        private int _threadSafeBoolBackValue = 0;

        /// <summary>
        /// Flag signalling that a buffer is currently being fed to the inference engine.
        /// Every read and write goes through <see cref="Interlocked"/> on an int backing field.
        /// Note that a separate read followed by a write is not atomic as a pair.
        /// </summary>
        public bool IsBusy
        {
            get
            {
                // Swapping 0 for 0 never changes the stored value; the call is used purely as a fenced read.
                return Interlocked.CompareExchange(ref _threadSafeBoolBackValue, 0, 0) == 1;
            }
            set
            {
                // The backing field only ever holds 0 or 1, so an unconditional exchange
                // leaves it in the same state a conditional 0<->1 swap would.
                Interlocked.Exchange(ref _threadSafeBoolBackValue, value ? 1 : 0);
            }
        }
        #endregion

        /// <summary>
        /// Builds the window, then creates the audio capture object and the speech-to-text client.
        /// </summary>
        public MainWindow()
        {
            InitializeComponent();

            // Loopback capture records the Windows audio output.
            // To record from the microphone instead, use: new WasapiCapture()
            _audioCapture = new WasapiLoopbackCapture();

            _sttClient = new DeepSpeechClient.DeepSpeech();
        }

        /// <summary>
        /// Fills the device list and loads the model on a worker thread.
        /// The controls are enabled once the model has been created; if that fails,
        /// the error message is shown and the window is closed.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private async void Window_Loaded(object sender, RoutedEventArgs e)
        {
            LoadAvailableCaptureDevices();

            try
            {
                // Create the model off the UI thread so the window stays responsive.
                await Task.Run(() =>
                {
                    _sttClient.CreateModel("output_graph.pbmm", N_CEP, N_CONTEXT, "alphabet.txt", BEAM_WIDTH);
                });

                // Execution resumes on the UI thread here, so controls can be touched directly.
                EnableControls();
            }
            catch (Exception ex)
            {
                // Close() has to run on the thread that owns the window. Handling the
                // failure after the await (instead of inside the worker) guarantees that.
                MessageBox.Show(ex.Message);
                Close();
            }
        }

        /// <summary>
        /// Enumerates the active render (output) devices, lists their friendly names in the
        /// combo box and selects the first entry when at least one device exists.
        /// </summary>
        private void LoadAvailableCaptureDevices()
        {
            // DataFlow.Render yields output devices; DataFlow.Capture would yield
            // input devices such as a microphone. Only active (enabled) devices are requested.
            _audioCaptureDevices = MMDeviceEnumerator.EnumerateDevices(DataFlow.Render, DeviceState.Active);

            foreach (var captureDevice in _audioCaptureDevices)
            {
                cbxAudioInputs.Items.Add(captureDevice.FriendlyName);
            }

            if (_audioCaptureDevices.Count == 0)
            {
                return;
            }

            cbxAudioInputs.SelectedIndex = 0;
        }

        /// <summary>
        /// Enables the start-recording, open-file and language-model buttons.
        /// Called after the model has been created.
        /// </summary>
        private void EnableControls()
        {
            btnStartRecording.IsEnabled = true;
            btnOpenFile.IsEnabled = true;
            btnEnableLM.IsEnabled = true;
        }

        /// <summary>
        /// Runs speech-to-text on the selected WAV file on a worker thread and shows the
        /// audio duration, the inference time and the recognized text.
        /// Failures are reported in a message box; the button and status label are
        /// restored whether or not the inference succeeds.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private async void BtnTranscript_Click(object sender, RoutedEventArgs e)
        {
            txtResult.Text = string.Empty;
            btnTranscript.IsEnabled = false;
            lblStatus.Content = "Running inference...";
            try
            {
                // NOTE(review): ReadAllBytes returns the whole file, so any header bytes of the
                // WAV container are handed to the engine as samples too - confirm this is intended.
                var waveBuffer = new NAudio.Wave.WaveBuffer(File.ReadAllBytes(txtFileName.Text));
                using (var waveInfo = new NAudio.Wave.WaveFileReader(txtFileName.Text))
                {
                    Console.WriteLine("Running inference....");

                    Stopwatch watch = Stopwatch.StartNew();
                    string speechResult = await Task.Run(() =>
                    {
                        // MaxSize is halved to convert the byte count into a count of 16-bit samples.
                        string text = _sttClient.SpeechToText(waveBuffer.ShortBuffer, Convert.ToUInt32(waveBuffer.MaxSize / 2), 16000);
                        watch.Stop();
                        return text;
                    });

                    // Back on the UI thread: no dispatcher call is needed to update the text box.
                    txtResult.Text = $"Audio duration: {waveInfo.TotalTime.ToString()} {Environment.NewLine}" +
                        $"Inference took: {watch.Elapsed.ToString()} {Environment.NewLine}" +
                        $"Recognized text: {speechResult}";
                }
                waveBuffer.Clear();
            }
            catch (Exception ex)
            {
                // An exception escaping an async void handler would otherwise go unhandled.
                MessageBox.Show(ex.Message);
            }
            finally
            {
                lblStatus.Content = string.Empty;
                btnTranscript.IsEnabled = true;
            }
        }

        /// <summary>
        /// Loads the language model on a worker thread and reports the outcome in the status label.
        /// On failure the button is re-enabled, the status is cleared and the error is shown.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private async void BtnEnableLM_Click(object sender, RoutedEventArgs e)
        {
            lblStatus.Content = "Loading LM.....";
            btnEnableLM.IsEnabled = false;
            try
            {
                await Task.Run(() =>
                {
                    _sttClient.EnableDecoderWithLM("alphabet.txt", "lm.binary", "trie", LM_ALPHA, LM_BETA);
                });

                // Execution continues on the UI thread after the await.
                lblStatus.Content = "LM loaded.";
            }
            catch (Exception ex)
            {
                btnEnableLM.IsEnabled = true;
                // Do not leave the "Loading" text on screen after a failed load.
                lblStatus.Content = string.Empty;
                MessageBox.Show(ex.Message);
            }
        }

        /// <summary>
        /// Lets the user pick a single WAV file; when a file is chosen its path is stored
        /// in the file name box and the transcript button is enabled.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void BtnOpenFile_Click(object sender, RoutedEventArgs e)
        {
            OpenFileDialog dialog = new OpenFileDialog
            {
                Filter = "wav Files |*.wav",
                Multiselect = false,
                Title = "Please select a wav file."
            };

            // ShowDialog returns a nullable bool; comparing with true avoids the
            // exception a direct (bool) cast raises on null.
            if (dialog.ShowDialog() != true)
            {
                return;
            }

            txtFileName.Text = dialog.FileName;
            btnTranscript.IsEnabled = true;
        }

        /// <summary>
        /// Releases the audio capture resources and the speech-to-text client when the
        /// window is really closing.
        /// </summary>
        /// <param name="e">Closing event data; if a handler cancels the close, nothing is released.</param>
        protected override void OnClosing(CancelEventArgs e)
        {
            // Let subscribers run first: they may cancel the close, and the window
            // must not be left alive with a disposed client.
            base.OnClosing(e);
            if (e.Cancel)
            {
                return;
            }

            _audioCapture.DataAvailable -= _capture_DataAvailable;
            if (_audioCapture.RecordingState == RecordingState.Recording)
            {
                _audioCapture.Stop();
            }

            // The sources are null until the capture has been initialized.
            _soundInSource?.Dispose();
            _convertedSource?.Dispose();
            _audioCapture.Dispose();

            _sttClient.Dispose();
        }

        /// <summary>
        /// Switches the capture to the device selected in the combo box.
        /// Any running capture is stopped and its stream discarded, the previous
        /// conversion sources are released, and the capture is initialized again.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void CbxAudioInputs_SelectionChanged(object sender, System.Windows.Controls.SelectionChangedEventArgs e)
        {
            btnStartRecording.IsEnabled = false;
            btnStopRecording.IsEnabled = false;

            bool wasRecording = _audioCapture.RecordingState == RecordingState.Recording;
            if (wasRecording)
            {
                _audioCapture.Stop();
            }

            // Release the previous pipeline even when idle: InitilizeAudioCapture subscribes
            // the handler and creates new sources on every call, so skipping this would stack
            // up duplicate subscriptions and leak the old sources.
            _audioCapture.DataAvailable -= _capture_DataAvailable;
            _soundInSource?.Dispose();
            _convertedSource?.Dispose();
            _soundInSource = null;
            _convertedSource = null;

            if (wasRecording)
            {
                // Buffers still queued belong to the stream that is about to be discarded.
                while (_bufferQueue.TryDequeue(out short[] staleBuffer))
                {
                }

                // The user changed the audio input, so the current stream is no longer needed.
                _sttClient.DiscardStream();
            }

            if (_audioCaptureDevices != null)
            {
                int selectedIndex = cbxAudioInputs.SelectedIndex;
                if (selectedIndex < 0 || selectedIndex >= _audioCaptureDevices.Count)
                {
                    // Nothing is selected (for example the selection was cleared):
                    // there is no device to initialize, so recording stays disabled.
                    return;
                }

                _audioCapture.Device = _audioCaptureDevices[selectedIndex];
            }

            InitilizeAudioCapture();
        }


        /// <summary>
        /// Initializes the capture object, subscribes the data callback and builds the
        /// conversion chain (16 kHz, 16 bits per sample, mono) that the callback reads from.
        /// Enables the start-recording button when done.
        /// </summary>
        private void InitilizeAudioCapture()
        {
            _audioCapture.Initialize();
            _audioCapture.DataAvailable += _capture_DataAvailable;

            var captureSource = new SoundInSource(_audioCapture);
            captureSource.FillWithZeros = false;
            _soundInSource = captureSource;

            // Convert whatever the device delivers into the format the model is fed with.
            var resampled = _soundInSource.ChangeSampleRate(16000); // sample rate
            var sixteenBit = resampled
                .ToSampleSource()
                .ToWaveSource(16); // bits per sample
            _convertedSource = sixteenBit.ToMono();

            btnStartRecording.IsEnabled = true;
        }

        /// <summary>
        /// Capture callback: drains the converted source, queues each chunk as 16-bit
        /// samples and schedules a worker to process the queue.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">
        /// Raw capture data. Deliberately not used: it is in the device format, not the
        /// converted format, so the data is read from the converted source instead.
        /// </param>
        private void _capture_DataAvailable(object sender, DataAvailableEventArgs e)
        {
            // Scratch buffer sized for half a second of converted audio.
            byte[] buffer = new byte[_convertedSource.WaveFormat.BytesPerSecond / 2];

            int read;
            // Keep reading as long as we still get some data.
            while ((read = _convertedSource.Read(buffer, 0, buffer.Length)) > 0)
            {
                // Round up BEFORE dividing. The previous Ceiling(read / 2) rounded after the
                // integer division, so an odd byte count undersized the array and the copy threw.
                short[] sdata = new short[(read + 1) / 2];
                Buffer.BlockCopy(buffer, 0, sdata, 0, read);
                _bufferQueue.Enqueue(sdata);
                Task.Run(() => OnNewData());
            }
        }


        /// <summary>
        /// Sets up a new inference stream, starts the audio capture and flips the
        /// start/stop buttons.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void BtnStartRecording_Click(object sender, RoutedEventArgs e)
        {
            // The stream is set up before the capture starts delivering data.
            // 16000 matches the sample rate the capture pipeline converts to.
            _sttClient.SetupStream(0, 16000);
            _audioCapture.Start();

            btnStopRecording.IsEnabled = true;
            btnStartRecording.IsEnabled = false;
        }

        /// <summary>
        /// Stops the capture, waits until every queued buffer has been processed,
        /// finishes the stream and shows the recognized text.
        /// Failures are reported in a message box; the start button is re-enabled either way.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private async void BtnStopRecording_Click(object sender, RoutedEventArgs e)
        {
            btnStartRecording.IsEnabled = false;
            btnStopRecording.IsEnabled = false;
            try
            {
                _audioCapture.Stop();
                string sttResult = await Task.Run(async () =>
                {
                    // Wait while there is ANY outstanding work: buffers still queued OR a buffer
                    // being fed right now. With "&&" the loop ended as soon as either condition
                    // cleared, so the stream could be finished while audio was still pending.
                    while (!_bufferQueue.IsEmpty || IsBusy)
                    {
                        await Task.Delay(90);
                    }
                    return _sttClient.FinishStream();
                });

                // Back on the UI thread after the await.
                txtResult.Text = sttResult;
            }
            catch (Exception ex)
            {
                // An exception escaping an async void handler would otherwise go unhandled.
                MessageBox.Show(ex.Message);
            }
            finally
            {
                btnStartRecording.IsEnabled = true;
            }
        }

        /// <summary>
        /// Feeds queued buffers to the inference engine, one worker at a time.
        /// If another worker already holds the busy flag this call returns immediately;
        /// that worker will pick up the remaining buffers.
        /// </summary>
        private void OnNewData()
        {
            while (!_bufferQueue.IsEmpty)
            {
                // Atomically take the busy flag (0 -> 1). A separate "check, then set" lets two
                // workers pass the check together and feed the engine concurrently.
                if (Interlocked.CompareExchange(ref _threadSafeBoolBackValue, 1, 0) != 0)
                {
                    return;
                }

                try
                {
                    // The flag is held before dequeuing, so a buffer is never "in flight"
                    // while the queue looks empty and the flag looks idle.
                    while (_bufferQueue.TryDequeue(out short[] buffer))
                    {
                        _sttClient.FeedAudioContent(buffer, Convert.ToUInt32(buffer.Length));
                    }
                }
                finally
                {
                    // Always release, otherwise one failed feed would block processing for good.
                    IsBusy = false;
                }

                // The outer loop re-checks the queue: a buffer enqueued between the last dequeue
                // and the release was skipped by its own worker (flag was taken) and is handled here.
            }
        }
    }
}