From d749578ef5df8cd286deeeea170cf62af4693f15 Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Fri, 30 Jun 2017 12:00:26 -0700
Subject: [PATCH] Improve the Python audioToInputVector implementation

---
 util/audio.py | 72 +++++++++++++++------------------------------------
 1 file changed, 21 insertions(+), 51 deletions(-)

diff --git a/util/audio.py b/util/audio.py
index 420f72d5..3b6946d0 100644
--- a/util/audio.py
+++ b/util/audio.py
@@ -12,70 +12,40 @@ except ImportError:
 
     def audioToInputVector(audio, fs, numcep, numcontext):
         if DeprecationWarning.displayed is not True:
+            DeprecationWarning.displayed = True
             print('------------------------------------------------------------------------')
             print('WARNING: libdeepspeech failed to load, resorting to deprecated code')
             print('         Refer to README.md for instructions on installing libdeepspeech')
             print('------------------------------------------------------------------------')
-            DeprecationWarning.displayed = True
 
         # Get mfcc coefficients
-        orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
+        features = mfcc(audio, samplerate=fs, numcep=numcep)
 
         # We only keep every second feature (BiRNN stride = 2)
-        orig_inputs = orig_inputs[::2]
+        features = features[::2]
 
-        # For each time slice of the training set, we need to copy the context this makes
-        # the numcep dimensions vector into a numcep + 2*numcep*numcontext dimensions
-        # because of:
-        #  - numcep dimensions for the current mfcc feature set
-        #  - numcontext*numcep dimensions for each of the past and future (x2) mfcc feature set
-        # => so numcep + 2*numcontext*numcep
-        train_inputs = np.array([], np.float32)
-        train_inputs.resize((orig_inputs.shape[0], numcep + 2*numcep*numcontext))
+        # One stride per time step in the input
+        num_strides = len(features)
 
-        # Prepare pre-fix post fix context (TODO: Fill empty_mfcc with MCFF of silence)
-        empty_mfcc = np.array([])
-        empty_mfcc.resize((numcep))
+        # Add empty initial and final contexts
+        empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
+        features = np.concatenate((empty_context, features, empty_context))
 
-        # Prepare train_inputs with past and future contexts
-        time_slices = list(range(train_inputs.shape[0]))
-        context_past_min   = time_slices[0]  + numcontext
-        context_future_max = time_slices[-1] - numcontext
-        for time_slice in time_slices:
-            ### Reminder: array[start:stop:step]
-            ### slices from indice |start| up to |stop| (not included), every |step|
-            # Pick up to numcontext time slices in the past, and complete with empty
-            # mfcc features
-            need_empty_past     = max(0, (context_past_min - time_slice))
-            empty_source_past   = list(empty_mfcc for empty_slots in range(need_empty_past))
-            data_source_past    = orig_inputs[max(0, time_slice - numcontext):time_slice]
-            assert(len(empty_source_past) + len(data_source_past) == numcontext)
+        # Create a view into the array with overlapping strides of size
+        # numcontext (past) + 1 (present) + numcontext (future)
+        window_size = 2*numcontext+1
+        train_inputs = np.lib.stride_tricks.as_strided(
+            features,
+            (num_strides, window_size, numcep),
+            (features.strides[0], features.strides[0], features.strides[1]),
+            writeable=False)
 
-            # Pick up to numcontext time slices in the future, and complete with empty
-            # mfcc features
-            need_empty_future   = max(0, (time_slice - context_future_max))
-            empty_source_future = list(empty_mfcc for empty_slots in range(need_empty_future))
-            data_source_future  = orig_inputs[time_slice + 1:time_slice + numcontext + 1]
-            assert(len(empty_source_future) + len(data_source_future) == numcontext)
+        # Flatten the second and third dimensions
+        train_inputs = np.reshape(train_inputs, [num_strides, -1])
 
-            if need_empty_past:
-                past   = np.concatenate((empty_source_past, data_source_past))
-            else:
-                past   = data_source_past
-
-            if need_empty_future:
-                future = np.concatenate((data_source_future, empty_source_future))
-            else:
-                future = data_source_future
-
-            past   = np.reshape(past, numcontext*numcep)
-            now    = orig_inputs[time_slice]
-            future = np.reshape(future, numcontext*numcep)
-
-            train_inputs[time_slice] = np.concatenate((past, now, future))
-            assert(len(train_inputs[time_slice]) == numcep + 2*numcep*numcontext)
-
-        # Whiten inputs (TODO: Should we whiten)
+        # Whiten inputs (TODO: Should we whiten?)
+        # Copy the strided array so that we can write to it safely
+        train_inputs = np.copy(train_inputs)
         train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
 
         # Return results