From d749578ef5df8cd286deeeea170cf62af4693f15 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Fri, 30 Jun 2017 12:00:26 -0700 Subject: [PATCH] Improve the Python audioToInputVector implementation --- util/audio.py | 72 +++++++++++++++------------------------------------ 1 file changed, 21 insertions(+), 51 deletions(-) diff --git a/util/audio.py b/util/audio.py index 420f72d5..3b6946d0 100644 --- a/util/audio.py +++ b/util/audio.py @@ -12,70 +12,40 @@ except ImportError: def audioToInputVector(audio, fs, numcep, numcontext): if DeprecationWarning.displayed is not True: + DeprecationWarning.displayed = True print('------------------------------------------------------------------------') print('WARNING: libdeepspeech failed to load, resorting to deprecated code') print(' Refer to README.md for instructions on installing libdeepspeech') print('------------------------------------------------------------------------') - DeprecationWarning.displayed = True # Get mfcc coefficients - orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep) + features = mfcc(audio, samplerate=fs, numcep=numcep) # We only keep every second feature (BiRNN stride = 2) - orig_inputs = orig_inputs[::2] + features = features[::2] - # For each time slice of the training set, we need to copy the context this makes - # the numcep dimensions vector into a numcep + 2*numcep*numcontext dimensions - # because of: - # - numcep dimensions for the current mfcc feature set - # - numcontext*numcep dimensions for each of the past and future (x2) mfcc feature set - # => so numcep + 2*numcontext*numcep - train_inputs = np.array([], np.float32) - train_inputs.resize((orig_inputs.shape[0], numcep + 2*numcep*numcontext)) + # One stride per time step in the input + num_strides = len(features) - # Prepare pre-fix post fix context (TODO: Fill empty_mfcc with MCFF of silence) - empty_mfcc = np.array([]) - empty_mfcc.resize((numcep)) + # Add empty initial and final contexts + empty_context = np.zeros((numcontext, numcep), dtype=features.dtype) + features = np.concatenate((empty_context, features, empty_context)) - # Prepare train_inputs with past and future contexts - time_slices = list(range(train_inputs.shape[0])) - context_past_min = time_slices[0] + numcontext - context_future_max = time_slices[-1] - numcontext - for time_slice in time_slices: - ### Reminder: array[start:stop:step] - ### slices from indice |start| up to |stop| (not included), every |step| - # Pick up to numcontext time slices in the past, and complete with empty - # mfcc features - need_empty_past = max(0, (context_past_min - time_slice)) - empty_source_past = list(empty_mfcc for empty_slots in range(need_empty_past)) - data_source_past = orig_inputs[max(0, time_slice - numcontext):time_slice] - assert(len(empty_source_past) + len(data_source_past) == numcontext) + # Create a view into the array with overlapping strides of size + # numcontext (past) + 1 (present) + numcontext (future) + window_size = 2*numcontext+1 + train_inputs = np.lib.stride_tricks.as_strided( + features, + (num_strides, window_size, numcep), + (features.strides[0], features.strides[0], features.strides[1]), + writeable=False) - # Pick up to numcontext time slices in the future, and complete with empty - # mfcc features - need_empty_future = max(0, (time_slice - context_future_max)) - empty_source_future = list(empty_mfcc for empty_slots in range(need_empty_future)) - data_source_future = orig_inputs[time_slice + 1:time_slice + numcontext + 1] - assert(len(empty_source_future) + len(data_source_future) == numcontext) + # Flatten the second and third dimensions + train_inputs = np.reshape(train_inputs, [num_strides, -1]) - if need_empty_past: - past = np.concatenate((empty_source_past, data_source_past)) - else: - past = data_source_past - - if need_empty_future: - future = np.concatenate((data_source_future, empty_source_future)) - else: - future = data_source_future - - past = np.reshape(past, numcontext*numcep) - now = orig_inputs[time_slice] - future = np.reshape(future, numcontext*numcep) - - train_inputs[time_slice] = np.concatenate((past, now, future)) - assert(len(train_inputs[time_slice]) == numcep + 2*numcep*numcontext) - - # Whiten inputs (TODO: Should we whiten) + # Whiten inputs (TODO: Should we whiten?) + # Copy the strided array so that we can write to it safely + train_inputs = np.copy(train_inputs) train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs) # Return results