From d8aaffce620c35b55c65e840ddb3a67f8c2fbd68 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 27 Oct 2016 16:08:14 -0200 Subject: [PATCH] Address review comments and do further filtering and cleanup on the transcription data --- util/importers/fisher.py | 127 +++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 57 deletions(-) diff --git a/util/importers/fisher.py b/util/importers/fisher.py index 683919c3..8cc4e701 100644 --- a/util/importers/fisher.py +++ b/util/importers/fisher.py @@ -13,7 +13,7 @@ from Queue import Queue from threading import Thread from util.audio import audiofile_to_input_vector from util.gpu import get_available_gpus -from util.text import texts_to_sparse_tensor +from util.text import texts_to_sparse_tensor, validate_label class DataSets(object): def __init__(self, train, dev, test): @@ -69,25 +69,30 @@ class DataSet(object): def _populate_batch_queue(self): with self._graph.as_default(): - while True: - n_steps = 0 - sources = [] - targets = [] - for index, (txt_file, wav_file) in enumerate(self._files_circular_list): - if index >= self._batch_size: - break - next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext) - if n_steps < next_source.shape[0]: - n_steps = next_source.shape[0] - sources.append(next_source) - with open(txt_file) as open_txt_file: - targets.append(open_txt_file.read()) - target = texts_to_sparse_tensor(targets) - for index, next_source in enumerate(sources): - npad = ((0,(n_steps - next_source.shape[0])), (0,0)) - sources[index] = np.pad(next_source, pad_width=npad, mode="constant") - source = np.array(sources) - self._batch_queue.put((source, target)) + n_steps = 0 + sources = [] + targets = [] + batch_index = 0 + for txt_file, wav_file in self._files_circular_list: + if batch_index == self._batch_size: + # Put batch on queue + target = texts_to_sparse_tensor(targets) + for index, next_source in enumerate(sources): + npad = ((0,(n_steps - next_source.shape[0])), (0,0)) + sources[index] = np.pad(next_source, pad_width=npad, mode='constant') + source = np.array(sources) + self._batch_queue.put((source, target)) + n_steps = 0 + sources = [] + targets = [] + batch_index = 0 + next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext) + if n_steps < next_source.shape[0]: + n_steps = next_source.shape[0] + sources.append(next_source) + with open(txt_file) as open_txt_file: + targets.append(open_txt_file.read()) + batch_index = batch_index + 1 def next_batch(self): source, target = self._batch_queue.get() @@ -106,25 +111,25 @@ def read_data_sets(graph, data_dir, batch_size, numcep, numcontext, thread_count _maybe_convert_wav(data_dir, "LDC2005S13", "fisher-2005-wav") # Conditionally split Fisher wav data - _maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav") - _maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav") + _maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav", "fisher-2004-split-wav") + _maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav", "fisher-2005-split-wav") # Conditionally split Fisher transcriptions - _maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav") - _maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav") + _maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-split-wav") + _maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-split-wav") # Conditionally split Fisher data into train/validation/test sets - _maybe_split_sets(data_dir, "fisher-2004-wav", "fisher-2004-wav-splits") - _maybe_split_sets(data_dir, "fisher-2005-wav", "fisher-2005-wav-splits") - + _maybe_split_sets(data_dir, "fisher-2004-split-wav", "fisher-2004-split-wav-sets") + _maybe_split_sets(data_dir, "fisher-2005-split-wav", "fisher-2005-split-wav-sets") + # Create train DataSet - train = _read_data_set(graph, data_dir, "fisher-200?-wav/train", thread_count, batch_size, numcep, numcontext) + train = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/train", thread_count, batch_size, numcep, numcontext) # Create dev DataSet - dev = _read_data_set(graph, data_dir, "fisher-200?-wav/dev", thread_count, batch_size, numcep, numcontext) + dev = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/dev", thread_count, batch_size, numcep, numcontext) # Create test DataSet - test = _read_data_set(graph, data_dir, "fisher-200?-wav/test", thread_count, batch_size, numcep, numcontext) + test = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/test", thread_count, batch_size, numcep, numcontext) # Return DataSets return DataSets(train, dev, test) @@ -134,18 +139,21 @@ def _maybe_convert_wav(data_dir, original_data, converted_data): target_dir = os.path.join(data_dir, converted_data) # Conditionally convert sph files to wav files - if not os.path.exists(target_dir): - # Create target_dir - os.makedirs(target_dir) - - # Loop over sph files in source_dir and convert each to 16-bit PCM wav - for root, dirnames, filenames in os.walk(source_dir): - for filename in fnmatch.filter(filenames, "*.sph"): - sph_file = os.path.join(root, filename) - wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav" - wav_file = os.path.join(target_dir, wav_filename) - print("converting {} to {}".format(sph_file, wav_file)) - subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file]) + if os.path.exists(target_dir): + print("skipping maybe_convert_wav") + return + + # Create target_dir + os.makedirs(target_dir) + + # Loop over sph files in source_dir and convert each to 16-bit PCM wav + for root, dirnames, filenames in os.walk(source_dir): + for filename in fnmatch.filter(filenames, "*.sph"): + sph_file = os.path.join(root, filename) + wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav" + wav_file = os.path.join(target_dir, wav_filename) + print("converting {} to {}".format(sph_file, wav_file)) + subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file]) def _parse_transcriptions(trans_file): segments = [] @@ -174,23 +182,26 @@ def _parse_transcriptions(trans_file): }) return segments -def _maybe_split_wav(data_dir, original_data, converted_data): +def _maybe_split_wav(data_dir, trans_data, original_data, converted_data): + trans_dir = os.path.join(data_dir, trans_data) source_dir = os.path.join(data_dir, original_data) target_dir = os.path.join(data_dir, converted_data) if os.path.exists(target_dir): - print("skipping split_wav") + print("skipping maybe_split_wav") return + os.makedirs(target_dir) + # Loop over transcription files and split corresponding wav - for root, dirnames, filenames in os.walk(source_dir): + for root, dirnames, filenames in os.walk(trans_dir): for filename in fnmatch.filter(filenames, "*.txt"): trans_file = os.path.join(root, filename) segments = _parse_transcriptions(trans_file) # Open wav corresponding to transcription file wav_filename = os.path.splitext(os.path.basename(trans_file))[0] + ".wav" - wav_file = os.path.join(target_dir, wav_filename) + wav_file = os.path.join(source_dir, wav_filename) print("splitting {} according to {}".format(wav_file, trans_file)) @@ -212,7 +223,7 @@ def _maybe_split_wav(data_dir, original_data, converted_data): origAudio.close() # Remove wav_file - os.remove(wav_file) + # os.remove(wav_file) def _split_wav(origAudio, start_time, stop_time, new_wav_file): frameRate = origAudio.getframerate() @@ -229,8 +240,8 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data): source_dir = os.path.join(data_dir, original_data) target_dir = os.path.join(data_dir, converted_data) - if os.path.exists(target_dir): - print("skipping split_transcriptions") + if os.path.exists(os.path.join(source_dir, "split_transcriptions_done")): + print("skipping maybe_split_transcriptions") return # Loop over transcription files and split them into individual files for @@ -247,19 +258,21 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data): txt_filename = os.path.splitext(os.path.basename(trans_file))[0] + "-" + str(start_time) + "-" + str(stop_time) + ".txt" txt_file = os.path.join(target_dir, txt_filename) - # If the txt segment filename does not exist create it - if not os.path.exists(txt_file): + transcript = validate_label(segment["transcript"]) + + # If the transcript is valid and the txt segment filename does + # not exist create it + if transcript != None and not os.path.exists(txt_file): with open(txt_file, "w") as fout: - fout.write(segment["transcript"]) - + fout.write(transcript) + + with open(os.path.join(source_dir, "split_transcriptions_done"), "w") as fout: + fout.write("This file signals to the importer than the transcription of this source dir has already been completed.") + def _maybe_split_sets(data_dir, original_data, converted_data): source_dir = os.path.join(data_dir, original_data) target_dir = os.path.join(data_dir, converted_data) - if os.path.exists(target_dir): - print("skipping split_sets") - return - filelist = sorted(glob(os.path.join(source_dir, "*.txt"))) # We initially split the entire set into 80% train and 20% test, then