mirror of
https://github.com/mozilla/DeepSpeech.git
synced 2025-10-26 11:19:39 +00:00
208 lines
7.7 KiB
Python
Executable File
208 lines
7.7 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
from __future__ import absolute_import, division, print_function
|
|
|
|
import itertools
|
|
import json
|
|
import numpy as np
|
|
import os
|
|
import pandas
|
|
import progressbar
|
|
import sys
|
|
import tables
|
|
import tensorflow as tf
|
|
|
|
from collections import namedtuple
|
|
from ds_ctcdecoder import ctc_beam_search_decoder_batch, Scorer
|
|
from multiprocessing import Pool, cpu_count
|
|
from six.moves import zip, range
|
|
from util.audio import audiofile_to_input_vector
|
|
from util.config import Config, initialize_globals
|
|
from util.flags import create_flags, FLAGS
|
|
from util.logging import log_error
|
|
from util.preprocess import preprocess
|
|
from util.text import Alphabet, levenshtein
|
|
from util.evaluate_tools import process_decode_result, calculate_report
|
|
|
|
def split_data(dataset, batch_size):
    """Yield consecutive, equally sized batches sliced from ``dataset``.

    Only complete batches are produced: when ``len(dataset)`` is not a
    multiple of ``batch_size`` the trailing partial batch is dropped.

    Args:
        dataset: Any sized object that supports slice indexing.
        batch_size: Number of items in every yielded batch.

    Yields:
        ``dataset[start:start + batch_size]`` for each full batch, in order.
    """
    # Length covered by whole batches; anything beyond it is discarded.
    usable_len = len(dataset) - len(dataset) % batch_size

    for start in range(0, usable_len, batch_size):
        yield dataset[start:start + batch_size]
|
|
|
|
|
|
def pad_to_dense(jagged):
    """Stack variable-length arrays into a single zero-padded array.

    Args:
        jagged: Non-empty sequence of NumPy arrays that may differ in their
            first dimension. The trailing dimensions and the dtype of the
            result are taken from the first array.

    Returns:
        A new array of shape ``(len(jagged), longest) + trailing_dims`` where
        ``longest`` is the largest ``len(row)``. Row ``i`` holds ``jagged[i]``
        at the front and zeros in the remaining positions.
    """
    longest = max(map(len, jagged))
    template = jagged[0]

    dense_shape = (len(jagged), longest) + template.shape[1:]
    dense = np.zeros(dense_shape, dtype=template.dtype)

    # Iterating over ``dense`` yields views of its rows, so writing through
    # ``target`` fills the output array itself.
    for target, row in zip(dense, jagged):
        target[:len(row)] = row

    return dense
|
|
|
|
|
|
def evaluate(test_data, inference_graph):
    """Run the test set through the model, decode it and print a report.

    The evaluation makes two passes over the same batches:

    1. Inside a TensorFlow session restored from the checkpoint state found
       in ``FLAGS.checkpoint_dir``, every batch is fed through the graph to
       collect its batch-major logits and its CTC loss values.
    2. The collected logits are decoded with the scorer-backed CTC beam
       search decoder and compared against the ground truth transcripts.

    Only full batches of ``FLAGS.test_batch_size`` samples are evaluated,
    because ``split_data`` drops a trailing partial batch.

    Note:
        The ``features`` column of ``test_data`` is overwritten in place with
        its windowed version, so the caller's table is modified.

    Args:
        test_data: Table providing the columns ``features``,
            ``features_len``, ``transcript`` and ``transcript_len``.
        inference_graph: Triple ``(inputs, outputs, layers)``. The code reads
            ``inputs['input']``, ``inputs['input_lengths']``,
            ``outputs['outputs']``, ``outputs['initialize_state']`` and
            ``layers['raw_logits']`` from it.

    Returns:
        The ``samples`` value produced by ``calculate_report``.

    Raises:
        SystemExit: With status 1 when ``FLAGS.checkpoint_dir`` does not
            contain a valid checkpoint state.
    """
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                    FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                    Config.alphabet)

    def create_windows(features):
        """Return a read-only strided view of overlapping context windows."""
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2*Config.n_context+1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, layers = inference_graph

        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

        # We add 1 to all elements of the transcript to avoid any zero values
        # since we use that as an end-of-sequence token for converting the batch
        # into a SparseTensor. So here we convert the placeholder back into a
        # SparseTensor and subtract ones to get the real labels.
        sparse_labels = tf.contrib.layers.dense_to_sparse(labels_ph)
        neg_ones = tf.SparseTensor(sparse_labels.indices, -1 * tf.ones_like(sparse_labels.values), sparse_labels.dense_shape)
        sparse_labels = tf.sparse_add(sparse_labels, neg_ones)

        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            # sys.exit instead of the site-provided exit() helper, which is
            # not guaranteed to exist in every interpreter setup.
            sys.exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []
        losses = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        # NOTE(review): `widget=` is kept unchanged; progressbar2 documents a
        # `widgets` list argument - confirm this keyword has the intended effect.
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            # Shift labels by one so that zero can act as padding (see above)
            labels = pad_to_dense(batch['transcript'].values + 1)
            label_lengths = batch['transcript_len'].values

            logits, loss_ = session.run([transposed, loss], feed_dict={
                inputs['input']: features,
                inputs['input_lengths']: features_len,
                labels_ph: labels,
                label_lengths_ph: label_lengths
            })

            logitses.append(logits)
            losses.extend(loss_)

    # Decoding needs no TensorFlow session, only the logits gathered above.
    ground_truths = []
    predictions = []

    print('Decoding predictions...')
    bar = progressbar.ProgressBar(max_value=batch_count,
                                  widget=progressbar.AdaptiveETA)

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        # Core count cannot be determined on this platform: decode serially.
        num_processes = 1

    # Second pass, decode logits and compute WER and edit distance metrics
    for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
        seq_lengths = batch['features_len'].values.astype(np.int32)
        decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, Config.alphabet, FLAGS.beam_width,
                                                num_processes=num_processes, scorer=scorer)

        ground_truths.extend(Config.alphabet.decode(l) for l in batch['transcript'])
        # Keep element [1] of the first decoding result of every sample
        predictions.extend(d[0][1] for d in decoded)

    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

    wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
    mean_loss = np.mean(losses)

    # Take only the first report_count items
    report_samples = itertools.islice(samples, FLAGS.report_count)

    print('Test - WER: %f, CER: %f, loss: %f' %
          (wer, cer, mean_loss))
    print('-' * 80)
    for sample in report_samples:
        # NOTE(review): the value printed under the "CER" label is
        # sample.distance - confirm that this is the intended quantity.
        print('WER: %f, CER: %f, loss: %f' %
              (sample.wer, sample.distance, sample.loss))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    return samples
|
|
|
|
|
|
def main(_):
    """Evaluate the files given via ``--test_files`` and optionally save the result.

    Initializes the global configuration, preprocesses the test files,
    builds the inference graph, runs ``evaluate`` on them and, when
    ``--test_output_file`` is set, dumps the returned samples to that file
    as JSON.

    Args:
        _: Unused positional argument supplied by ``tf.app.run``.

    Raises:
        SystemExit: With status 1 when ``--test_files`` is not set.
    """
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        # sys.exit instead of the site-provided exit() helper, which is not
        # guaranteed to exist in every interpreter setup.
        sys.exit(1)

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(
        FLAGS.test_files.split(','),
        FLAGS.test_batch_size,
        alphabet=Config.alphabet,
        numcep=Config.n_input,
        numcontext=Config.n_context,
        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
            by="features_len",
            ascending=False)

    # Imported locally rather than at module level.
    # NOTE(review): presumably to avoid a circular import - confirm.
    from DeepSpeech import create_inference_graph
    graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)

    samples = evaluate(test_data, graph)

    if FLAGS.test_output_file:
        # Save decoded tuples as JSON, converting NumPy floats to Python floats.
        # The context manager makes sure the file is flushed and closed.
        with open(FLAGS.test_output_file, 'w') as fout:
            json.dump(samples, fout, default=float)
|
|
|
|
|
|
if __name__ == '__main__':
    create_flags()

    # String flags that exist only for this evaluation script; both default
    # to the empty string and are registered in this order.
    for flag_name, flag_help in (
            ('hdf5_test_set', 'path to hdf5 file to cache test set features'),
            ('test_output_file', 'path to a file to save all src/decoded/distance/loss tuples'),
    ):
        tf.app.flags.DEFINE_string(flag_name, '', flag_help)

    tf.app.run(main)
|