From 77100ed1dfdeae4b2de545474f478016b9e8aca7 Mon Sep 17 00:00:00 2001 From: Chris Lord Date: Tue, 15 Nov 2016 14:32:51 +0000 Subject: [PATCH] Don't duplicate graph to do validation --- DeepSpeech.ipynb | 71 ++++++++++++++++++++------------------ util/importers/ldc93s1.py | 3 +- util/importers/librivox.py | 3 +- util/importers/ted.py | 3 +- 4 files changed, 41 insertions(+), 39 deletions(-) diff --git a/DeepSpeech.ipynb b/DeepSpeech.ipynb index bb0cadbc..d44df3b0 100644 --- a/DeepSpeech.ipynb +++ b/DeepSpeech.ipynb @@ -91,7 +91,7 @@ "from util.log import merge_logs\n", "from util.gpu import get_available_gpus\n", "from util.shared_lib import check_cupti\n", - "from util.text import sparse_tensor_value_to_texts, wers\n", + "from util.text import ctc_label_dense_to_sparse, sparse_tensor_value_to_texts, wers\n", "from tensorflow.python.ops import ctc_ops\n", "from tensorflow.contrib.session_bundle import exporter\n", "\n", @@ -169,7 +169,9 @@ "source": [ "Note that we use the Adam optimizer[[3]](http://arxiv.org/abs/1412.6980) instead of Nesterov’s Accelerated Gradient [[4]](http://www.cs.utoronto.ca/~ilya/pubs/2013/1051_2.pdf) used in the original DeepSpeech paper, as, at the time of writing, TensorFlow does not have an implementation of Nesterov’s Accelerated Gradient [[4]](http://www.cs.utoronto.ca/~ilya/pubs/2013/1051_2.pdf).\n", "\n", - "As we will also employ dropout on the feedforward layers of the network, we need to define a parameter `dropout_rate` that keeps track of the dropout rate for these layers" + "As we will also employ dropout on the feedforward layers of the network, we need to define a parameter `dropout_rate` that keeps track of the dropout rate for these layers.\n", + "\n", + "To avoid graph duplication when performing validation steps, we use a placeholder and an alternative feed_dict during validation to pull data from the corresponding queue." ] }, { @@ -185,11 +187,20 @@ "# This global placeholder will be used for all dropout definitions\n", "dropout_rate_placeholder = tf.placeholder(tf.float32)\n", "\n", - "# The feed_dict used for training employs the given dropout_rate\n", - "feed_dict_train = { dropout_rate_placeholder: dropout_rate }\n", + "# This placeholder will be used to select between queues\n", + "queue_selector_placeholder = tf.placeholder(tf.uint8)\n", "\n", - "# While the feed_dict used for validation, test and train progress reporting employs zero dropout\n", - "feed_dict = { dropout_rate_placeholder: 0.0 }" + "# The feed_dict used for training employs the given dropout_rate\n", + "feed_dict_train = { dropout_rate_placeholder: dropout_rate,\n", + " queue_selector_placeholder: 0 }\n", + "\n", + "# The feed dict used for validation employs zero dropout and selects the validation queue\n", + "feed_dict_validate = { dropout_rate_placeholder: 0.0,\n", + " queue_selector_placeholder: 1 }\n", + "\n", + "# While the feed_dict used for test reporting employs zero dropout\n", + "feed_dict_test = { dropout_rate_placeholder: 0.0,\n", + " queue_selector_placeholder: 0 }" ] }, { @@ -644,10 +655,7 @@ }, "outputs": [], "source": [ - "def calculate_accuracy_and_loss(batch_set):\n", - " # Obtain the next batch of data\n", - " batch_x, batch_seq_len, batch_y = batch_set.next_batch()\n", - "\n", + "def calculate_accuracy_and_loss(batch_x, batch_seq_len, batch_y):\n", " # Calculate the logits of the batch using BiRNN\n", " logits = BiRNN(batch_x, tf.to_int64(batch_seq_len))\n", " \n", @@ -677,15 +685,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The first lines of `calculate_accuracy_and_loss()`\n", - "```python\n", - "def calculate_accuracy_and_loss(batch_set):\n", - " # Obtain the next batch of data\n", - " batch_x, batch_seq_len, batch_y = batch_set.next_batch()\n", - "```\n", - "simply obtain the next mini-batch of data.\n", - "\n", - "The next line\n", + "The first line of `calculate_accuracy_and_loss()`\n", "```python\n", " # Calculate the logits from the BiRNN\n", " logits = BiRNN(batch_x, batch_seq_len)\n", @@ -912,7 +912,7 @@ }, "outputs": [], "source": [ - "def get_tower_results(batch_set, optimizer=None):\n", + "def get_tower_results(batch_sets, optimizer=None):\n", " # Tower decodings to return\n", " tower_decodings = []\n", " # Tower labels to return\n", @@ -932,9 +932,18 @@ " with tf.device(available_devices[i]):\n", " # Create a scope for all operations of tower i\n", " with tf.name_scope('tower_%d' % i) as scope:\n", + " # Fetch the next batch of data\n", + " batch_x, batch_x_seq_len, batch_y, batch_y_seq_len = \\\n", + " tf.cond(tf.less(queue_selector_placeholder, 1),\n", + " lambda: batch_sets[0].next_batch(),\n", + " lambda: batch_sets[1].next_batch())\n", + " \n", " # Calculate the avg_loss and accuracy and retrieve the decoded \n", " # batch along with the original batch's labels (Y) of this tower\n", - " total_loss, avg_loss, accuracy, decoded, labels = calculate_accuracy_and_loss(batch_set)\n", + " batch_y = ctc_label_dense_to_sparse(batch_y, batch_y_seq_len, batch_size)\n", + " total_loss, avg_loss, accuracy, decoded, labels = calculate_accuracy_and_loss(batch_x,\n", + " batch_x_seq_len,\n", + " batch_y)\n", " \n", " # Allow for variables to be re-used by the next tower\n", " tf.get_variable_scope().reuse_variables()\n", @@ -1155,7 +1164,7 @@ "source": [ "def get_results_params(data_set):\n", " # Get tower results\n", - " tower_decodings, tower_labels, _, tower_total_losses, _, _ = get_tower_results(data_set)\n", + " tower_decodings, tower_labels, _, tower_total_losses, _, _ = get_tower_results([data_set, data_set])\n", " # Join the individual results tensors into a results_params tuple\n", " return (tower_labels, tower_decodings, tower_total_losses)\n", " " @@ -1268,7 +1277,7 @@ }, "outputs": [], "source": [ - "def run_inference(session, caption, data_set, results_params=None):\n", + "def run_inference(session, caption, data_set, feed_dict, results_params=None):\n", " if results_params is None:\n", " # Get the data_set specific graph end-points\n", " results_params = get_results_params(data_set)\n", @@ -1322,10 +1331,7 @@ " tower_total_losses, \\\n", " tower_avg_losses, \\\n", " avg_accuracy \\\n", - " = get_tower_results(data_sets.train, optimizer)\n", - " \n", - " # Validation results parameters\n", - " dev_results_params = get_results_params(data_sets.dev)\n", + " = get_tower_results([data_sets.train, data_sets.dev], optimizer)\n", " \n", " # Average tower gradients\n", " avg_tower_gradients = average_gradients(tower_gradients)\n", @@ -1348,8 +1354,8 @@ " # Start importer's queue threads\n", " data_sets.start_queue_threads(session)\n", " \n", - " # Training results parameters\n", - " train_results_params = (tower_labels, tower_decodings, tower_total_losses)\n", + " # Result parameters\n", + " results_params = (tower_labels, tower_decodings, tower_total_losses)\n", " \n", " # Prepare tensor board logging\n", " merged = tf.merge_all_summaries()\n", @@ -1382,7 +1388,7 @@ " # Create training results tuple\n", " train_results = ([],[],[])\n", " # Extend the session.run parameters\n", - " params.append(train_results_params)\n", + " params.append(results_params)\n", "\n", " # Loop over the batches\n", " for batch in range(int(ceil(batches_per_device))):\n", @@ -1417,8 +1423,7 @@ " \n", " # Validation step\n", " if epoch % validation_step == 0:\n", - " dev_wer = run_inference(session, \"Validation\", data_sets.dev, results_params=dev_results_params)\n", - " \n", + " dev_wer = run_inference(session, \"Validation\", data_sets.dev, feed_dict_validate, results_params)\n", "\n", " # Checkpoint the model\n", " if (epoch % checkpoint_step == 0) or (epoch == training_iters - 1):\n", @@ -1469,7 +1474,7 @@ " duration = duration.days * 86400 + duration.seconds\n", " \n", " # Finally the model is tested against some unbiased data-set\n", - " test_wer = run_inference(session, \"Test\", data_sets.test)" + " test_wer = run_inference(session, \"Test\", data_sets.test, feed_dict_test)" ] }, { diff --git a/util/importers/ldc93s1.py b/util/importers/ldc93s1.py index 66b24bad..bde0f1e7 100644 --- a/util/importers/ldc93s1.py +++ b/util/importers/ldc93s1.py @@ -82,8 +82,7 @@ class DataSet(object): def next_batch(self): source, source_lengths, target, target_lengths = self._example_queue.dequeue_many(self._batch_size) - sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._batch_size) - return source, source_lengths, sparse_labels + return source, source_lengths, target, target_lengths @property def total_batches(self): diff --git a/util/importers/librivox.py b/util/importers/librivox.py index 8ce267b6..b1101b98 100644 --- a/util/importers/librivox.py +++ b/util/importers/librivox.py @@ -98,8 +98,7 @@ class DataSet(object): def next_batch(self): source, source_lengths, target, target_lengths = self._example_queue.dequeue_many(self._batch_size) - sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._batch_size) - return source, source_lengths, sparse_labels + return source, source_lengths, target, target_lengths @property def total_batches(self): diff --git a/util/importers/ted.py b/util/importers/ted.py index 4fe9bfd2..05bb632d 100644 --- a/util/importers/ted.py +++ b/util/importers/ted.py @@ -106,8 +106,7 @@ class DataSet(object): def next_batch(self): source, source_lengths, target, target_lengths = self._example_queue.dequeue_many(self._batch_size) - sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._batch_size) - return source, source_lengths, sparse_labels + return source, source_lengths, target, target_lengths @property def total_batches(self):