From c0bb34cfd337054bcd40bf0caaa762886777a9a3 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Tue, 8 Nov 2016 03:04:58 -0200 Subject: [PATCH] Undo extraction of shared DataSets code from importers --- util/datasets.py | 61 --------------------------------- util/importers/ldc93s1.py | 70 ++++++++++++++++++++++++++++++++++---- util/importers/librivox.py | 66 +++++++++++++++++++++++++++++++---- util/importers/ted.py | 67 ++++++++++++++++++++++++++++++++---- 4 files changed, 183 insertions(+), 81 deletions(-) delete mode 100644 util/datasets.py diff --git a/util/datasets.py b/util/datasets.py deleted file mode 100644 index 028cb73a..00000000 --- a/util/datasets.py +++ /dev/null @@ -1,61 +0,0 @@ -import tensorflow as tf - -from math import ceil -from threading import Thread -from util.gpu import get_available_gpus -from util.text import ctc_label_dense_to_sparse - -class DataSets(object): - def __init__(self, train, dev, test): - self._dev = dev - self._test = test - self._train = train - - @property - def train(self): - return self._train - - @property - def dev(self): - return self._dev - - @property - def test(self): - return self._test - -class BaseDataSet(object): - def __init__(self, session, txt_files, thread_count, batch_size, num_mfcc_features, num_context): - self._session = session - self._num_mfcc_features = num_mfcc_features - self._x = tf.placeholder(tf.float32, [None, num_mfcc_features + (2 * num_mfcc_features * num_context)]) - self._x_length = tf.placeholder(tf.int32, []) - self._y = tf.placeholder(tf.int32, [None,]) - self._y_length = tf.placeholder(tf.int32, []) - self._example_queue = tf.PaddingFIFOQueue(shapes=[[None, num_mfcc_features + (2 * num_mfcc_features * num_context)], [], [None,], []], - dtypes=[tf.float32, tf.int32, tf.int32, tf.int32], - capacity=2 * self._get_device_count() * batch_size) - self._enqueue_op = self._example_queue.enqueue([self._x, self._x_length, self._y, self._y_length]) - self._txt_files = txt_files - self._batch_size = batch_size - self._num_context = num_context - self._thread_count = thread_count - - def _get_device_count(self): - available_gpus = get_available_gpus() - return max(len(available_gpus), 1) - - def _start_queue_threads(self): - batch_threads = [Thread(target=self._populate_batch_queue) for i in xrange(self._thread_count)] - for batch_thread in batch_threads: - batch_thread.daemon = True - batch_thread.start() - - def next_batch(self): - source, source_lengths, target, target_lengths = self._example_queue.dequeue_many(self._batch_size) - sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._batch_size) - return source, source_lengths, sparse_labels - - @property - def total_batches(self): - # Note: If len(_txt_files) % _batch_size != 0, this re-uses initial _txt_files - return int(ceil(float(len(self._txt_files)) /float(self._batch_size))) \ No newline at end of file diff --git a/util/importers/ldc93s1.py b/util/importers/ldc93s1.py index bd06429c..e772c737 100644 --- a/util/importers/ldc93s1.py +++ b/util/importers/ldc93s1.py @@ -1,20 +1,65 @@ +import tensorflow as tf + from os import path from glob import glob -from util.datasets import BaseDataSet, DataSets -from util.text import text_to_char_array +from math import ceil +from threading import Thread +from util.gpu import get_available_gpus +from util.text import text_to_char_array, ctc_label_dense_to_sparse from util.audio import audiofile_to_input_vector from tensorflow.contrib.learn.python.learn.datasets import base -class DataSet(BaseDataSet): - def __init__(self, *args, **kwargs): - super(DataSet, self).__init__(*args, **kwargs) +class DataSets(object): + def __init__(self, train, dev, test): + self._dev = dev + self._test = test + self._train = train + + @property + def train(self): + return self._train + + @property + def dev(self): + return self._dev + + @property + def test(self): + return self._test + +class DataSet(object): + def __init__(self, session, txt_files, thread_count, batch_size, numcep, numcontext): + self._session = session + self._numcep = numcep + self._x = tf.placeholder(tf.float32, [None, numcep + (2 * numcep * numcontext)]) + self._x_length = tf.placeholder(tf.int32, []) + self._y = tf.placeholder(tf.int32, [None,]) + self._y_length = tf.placeholder(tf.int32, []) + self._example_queue = tf.PaddingFIFOQueue(shapes=[[None, numcep + (2 * numcep * numcontext)], [], [None,], []], + dtypes=[tf.float32, tf.int32, tf.int32, tf.int32], + capacity=2 * self._get_device_count() * batch_size) + self._enqueue_op = self._example_queue.enqueue([self._x, self._x_length, self._y, self._y_length]) + self._txt_files = txt_files + self._batch_size = batch_size + self._numcontext = numcontext + self._thread_count = thread_count self._start_queue_threads() - + + def _get_device_count(self): + available_gpus = get_available_gpus() + return max(len(available_gpus), 1) + + def _start_queue_threads(self): + batch_threads = [Thread(target=self._populate_batch_queue) for i in xrange(self._thread_count)] + for batch_thread in batch_threads: + batch_thread.daemon = True + batch_thread.start() + def _compute_source_target(self): txt_file = self._txt_files[0] wav_file = path.splitext(txt_file)[0] + ".wav" - audio_waves = audiofile_to_input_vector(wav_file, self._num_mfcc_features, self._num_context) + audio_waves = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext) with open(txt_file) as open_txt_file: original = ' '.join(open_txt_file.read().strip().lower().split(' ')[2:]).replace('.', '') @@ -32,6 +77,17 @@ class DataSet(BaseDataSet): self._y: target, self._y_length: target_len}) + def next_batch(self): + source, source_lengths, target, target_lengths = self._example_queue.dequeue_many(self._batch_size) + sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._batch_size) + return source, source_lengths, sparse_labels + + @property + def total_batches(self): + # Note: If len(_txt_files) % _batch_size != 0, this re-uses initial _txt_files + return int(ceil(float(len(self._txt_files)) /float(self._batch_size))) + + def read_data_sets(session, data_dir, batch_size, numcep, numcontext, thread_count=1): # Conditionally download data LDC93S1_BASE = "LDC93S1" diff --git a/util/importers/librivox.py b/util/importers/librivox.py index 6a51ed57..6f395888 100644 --- a/util/importers/librivox.py +++ b/util/importers/librivox.py @@ -3,6 +3,7 @@ import os import random import subprocess import tarfile +import tensorflow as tf from glob import glob from itertools import cycle @@ -12,16 +13,58 @@ from Queue import PriorityQueue from shutil import rmtree from tensorflow.contrib.learn.python.learn.datasets import base from tensorflow.python.platform import gfile +from threading import Thread from util.audio import audiofile_to_input_vector -from util.datasets import BaseDataSet, DataSets -from util.text import text_to_char_array +from util.gpu import get_available_gpus +from util.text import text_to_char_array, ctc_label_dense_to_sparse -class DataSet(BaseDataSet): - def __init__(self, *args, **kwargs): - super(DataSet, self).__init__(*args, **kwargs) +class DataSets(object): + def __init__(self, train, dev, test): + self._dev = dev + self._test = test + self._train = train + + @property + def train(self): + return self._train + + @property + def dev(self): + return self._dev + + @property + def test(self): + return self._test + +class DataSet(object): + def __init__(self, session, txt_files, thread_count, batch_size, numcep, numcontext): + self._session = session + self._numcep = numcep + self._x = tf.placeholder(tf.float32, [None, numcep + (2 * numcep * numcontext)]) + self._x_length = tf.placeholder(tf.int32, []) + self._y = tf.placeholder(tf.int32, [None,]) + self._y_length = tf.placeholder(tf.int32, []) + self._example_queue = tf.PaddingFIFOQueue(shapes=[[None, numcep + (2 * numcep * numcontext)], [], [None,], []], + dtypes=[tf.float32, tf.int32, tf.int32, tf.int32], + capacity=2 * self._get_device_count() * batch_size) + self._enqueue_op = self._example_queue.enqueue([self._x, self._x_length, self._y, self._y_length]) + self._txt_files = txt_files + self._batch_size = batch_size + self._numcontext = numcontext + self._thread_count = thread_count self._files_circular_list = self._create_files_circular_list() self._start_queue_threads() + def _get_device_count(self): + available_gpus = get_available_gpus() + return max(len(available_gpus), 1) + + def _start_queue_threads(self): + batch_threads = [Thread(target=self._populate_batch_queue) for i in xrange(self._thread_count)] + for batch_thread in batch_threads: + batch_thread.daemon = True + batch_thread.start() + def _create_files_circular_list(self): priorityQueue = PriorityQueue() for txt_file in self._txt_files: @@ -36,7 +79,7 @@ class DataSet(BaseDataSet): def _populate_batch_queue(self): for txt_file, wav_file in self._files_circular_list: - source = audiofile_to_input_vector(wav_file, self._num_mfcc_features, self._num_context) + source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext) source_len = len(next_source) with open(txt_file) as open_txt_file: target = text_to_char_array(open_txt_file.read()) @@ -47,6 +90,17 @@ class DataSet(BaseDataSet): self._y: target, self._y_length: target_len}) + def next_batch(self): + source, source_lengths, target, target_lengths = self._example_queue.dequeue_many(self._batch_size) + sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._batch_size) + return source, source_lengths, sparse_labels + + @property + def total_batches(self): + # Note: If len(_txt_files) % _batch_size != 0, this re-uses initial _txt_files + return int(ceil(float(len(self._txt_files)) /float(self._batch_size))) + + def read_data_sets(session, data_dir, batch_size, numcep, numcontext, thread_count=8): # Check if we can convert FLAC with SoX before we start sox_help_out = subprocess.check_output(["sox", "-h"]) diff --git a/util/importers/ted.py b/util/importers/ted.py index f7cc139f..c56cba69 100644 --- a/util/importers/ted.py +++ b/util/importers/ted.py @@ -3,6 +3,7 @@ import random import tarfile import threading import numpy as np +import tensorflow as tf from os import path from os import rmdir @@ -16,19 +17,60 @@ from itertools import cycle from os.path import getsize from threading import Thread from Queue import PriorityQueue -from util.datasets import BaseDataSet, DataSets from util.stm import parse_stm_file -from util.text import text_to_char_array +from util.gpu import get_available_gpus +from util.text import text_to_char_array, ctc_label_dense_to_sparse from tensorflow.python.platform import gfile from util.audio import audiofile_to_input_vector from tensorflow.contrib.learn.python.learn.datasets import base -class DataSet(BaseDataSet): - def __init__(self, *args, **kwargs): - super(DataSet, self).__init__(*args, **kwargs) +class DataSets(object): + def __init__(self, train, dev, test): + self._dev = dev + self._test = test + self._train = train + + @property + def train(self): + return self._train + + @property + def dev(self): + return self._dev + + @property + def test(self): + return self._test + +class DataSet(object): + def __init__(self, session, txt_files, thread_count, batch_size, numcep, numcontext): + self._session = session + self._numcep = numcep + self._x = tf.placeholder(tf.float32, [None, numcep + (2 * numcep * numcontext)]) + self._x_length = tf.placeholder(tf.int32, []) + self._y = tf.placeholder(tf.int32, [None,]) + self._y_length = tf.placeholder(tf.int32, []) + self._example_queue = tf.PaddingFIFOQueue(shapes=[[None, numcep + (2 * numcep * numcontext)], [], [None,], []], + dtypes=[tf.float32, tf.int32, tf.int32, tf.int32], + capacity=2 * self._get_device_count() * batch_size) + self._enqueue_op = self._example_queue.enqueue([self._x, self._x_length, self._y, self._y_length]) + self._txt_files = txt_files + self._batch_size = batch_size + self._numcontext = numcontext + self._thread_count = thread_count self._files_circular_list = self._create_files_circular_list() self._start_queue_threads() - + + def _get_device_count(self): + available_gpus = get_available_gpus() + return max(len(available_gpus), 1) + + def _start_queue_threads(self): + batch_threads = [Thread(target=self._populate_batch_queue) for i in xrange(self._thread_count)] + for batch_thread in batch_threads: + batch_thread.daemon = True + batch_thread.start() + def _create_files_circular_list(self): priorityQueue = PriorityQueue() for txt_file in self._txt_files: @@ -45,7 +87,7 @@ class DataSet(BaseDataSet): def _populate_batch_queue(self): for txt_file, wav_file in self._files_circular_list: - source = audiofile_to_input_vector(wav_file, self._num_mfcc_features, self._num_context) + source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext) source_len = len(source) with open(txt_file) as open_txt_file: target = text_to_char_array(open_txt_file.read()) @@ -56,6 +98,17 @@ class DataSet(BaseDataSet): self._y: target, self._y_length: target_len}) + def next_batch(self): + source, source_lengths, target, target_lengths = self._example_queue.dequeue_many(self._batch_size) + sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._batch_size) + return source, source_lengths, sparse_labels + + @property + def total_batches(self): + # Note: If len(_txt_files) % _batch_size != 0, this re-uses initial _txt_files + return int(ceil(float(len(self._txt_files)) /float(self._batch_size))) + + def read_data_sets(session, data_dir, batch_size, numcep, numcontext, thread_count=8): # Conditionally download data TED_DATA = "TEDLIUM_release2.tar.gz"