Address review comments and do further filtering and cleanup on the transcription data

This commit is contained in:
Reuben Morais 2016-10-27 16:08:14 -02:00
parent c7eaf9939b
commit d8aaffce62

View File

@ -13,7 +13,7 @@ from Queue import Queue
from threading import Thread
from util.audio import audiofile_to_input_vector
from util.gpu import get_available_gpus
from util.text import texts_to_sparse_tensor
from util.text import texts_to_sparse_tensor, validate_label
class DataSets(object):
def __init__(self, train, dev, test):
@ -69,25 +69,30 @@ class DataSet(object):
def _populate_batch_queue(self):
with self._graph.as_default():
while True:
n_steps = 0
sources = []
targets = []
for index, (txt_file, wav_file) in enumerate(self._files_circular_list):
if index >= self._batch_size:
break
next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
if n_steps < next_source.shape[0]:
n_steps = next_source.shape[0]
sources.append(next_source)
with open(txt_file) as open_txt_file:
targets.append(open_txt_file.read())
target = texts_to_sparse_tensor(targets)
for index, next_source in enumerate(sources):
npad = ((0,(n_steps - next_source.shape[0])), (0,0))
sources[index] = np.pad(next_source, pad_width=npad, mode="constant")
source = np.array(sources)
self._batch_queue.put((source, target))
n_steps = 0
sources = []
targets = []
batch_index = 0
for txt_file, wav_file in self._files_circular_list:
if batch_index == self._batch_size:
# Put batch on queue
target = texts_to_sparse_tensor(targets)
for index, next_source in enumerate(sources):
npad = ((0,(n_steps - next_source.shape[0])), (0,0))
sources[index] = np.pad(next_source, pad_width=npad, mode='constant')
source = np.array(sources)
self._batch_queue.put((source, target))
n_steps = 0
sources = []
targets = []
batch_index = 0
next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
if n_steps < next_source.shape[0]:
n_steps = next_source.shape[0]
sources.append(next_source)
with open(txt_file) as open_txt_file:
targets.append(open_txt_file.read())
batch_index = batch_index + 1
def next_batch(self):
source, target = self._batch_queue.get()
@ -106,25 +111,25 @@ def read_data_sets(graph, data_dir, batch_size, numcep, numcontext, thread_count
_maybe_convert_wav(data_dir, "LDC2005S13", "fisher-2005-wav")
# Conditionally split Fisher wav data
_maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav")
_maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav")
_maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav", "fisher-2004-split-wav")
_maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav", "fisher-2005-split-wav")
# Conditionally split Fisher transcriptions
_maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav")
_maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav")
_maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-split-wav")
_maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-split-wav")
# Conditionally split Fisher data into train/validation/test sets
_maybe_split_sets(data_dir, "fisher-2004-wav", "fisher-2004-wav-splits")
_maybe_split_sets(data_dir, "fisher-2005-wav", "fisher-2005-wav-splits")
_maybe_split_sets(data_dir, "fisher-2004-split-wav", "fisher-2004-split-wav-sets")
_maybe_split_sets(data_dir, "fisher-2005-split-wav", "fisher-2005-split-wav-sets")
# Create train DataSet
train = _read_data_set(graph, data_dir, "fisher-200?-wav/train", thread_count, batch_size, numcep, numcontext)
train = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/train", thread_count, batch_size, numcep, numcontext)
# Create dev DataSet
dev = _read_data_set(graph, data_dir, "fisher-200?-wav/dev", thread_count, batch_size, numcep, numcontext)
dev = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/dev", thread_count, batch_size, numcep, numcontext)
# Create test DataSet
test = _read_data_set(graph, data_dir, "fisher-200?-wav/test", thread_count, batch_size, numcep, numcontext)
test = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/test", thread_count, batch_size, numcep, numcontext)
# Return DataSets
return DataSets(train, dev, test)
@ -134,18 +139,21 @@ def _maybe_convert_wav(data_dir, original_data, converted_data):
target_dir = os.path.join(data_dir, converted_data)
# Conditionally convert sph files to wav files
if not os.path.exists(target_dir):
# Create target_dir
os.makedirs(target_dir)
# Loop over sph files in source_dir and convert each to 16-bit PCM wav
for root, dirnames, filenames in os.walk(source_dir):
for filename in fnmatch.filter(filenames, "*.sph"):
sph_file = os.path.join(root, filename)
wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav"
wav_file = os.path.join(target_dir, wav_filename)
print("converting {} to {}".format(sph_file, wav_file))
subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file])
if os.path.exists(target_dir):
print("skipping maybe_convert_wav")
return
# Create target_dir
os.makedirs(target_dir)
# Loop over sph files in source_dir and convert each to 16-bit PCM wav
for root, dirnames, filenames in os.walk(source_dir):
for filename in fnmatch.filter(filenames, "*.sph"):
sph_file = os.path.join(root, filename)
wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav"
wav_file = os.path.join(target_dir, wav_filename)
print("converting {} to {}".format(sph_file, wav_file))
subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file])
def _parse_transcriptions(trans_file):
segments = []
@ -174,23 +182,26 @@ def _parse_transcriptions(trans_file):
})
return segments
def _maybe_split_wav(data_dir, original_data, converted_data):
def _maybe_split_wav(data_dir, trans_data, original_data, converted_data):
trans_dir = os.path.join(data_dir, trans_data)
source_dir = os.path.join(data_dir, original_data)
target_dir = os.path.join(data_dir, converted_data)
if os.path.exists(target_dir):
print("skipping split_wav")
print("skipping maybe_split_wav")
return
os.makedirs(target_dir)
# Loop over transcription files and split corresponding wav
for root, dirnames, filenames in os.walk(source_dir):
for root, dirnames, filenames in os.walk(trans_dir):
for filename in fnmatch.filter(filenames, "*.txt"):
trans_file = os.path.join(root, filename)
segments = _parse_transcriptions(trans_file)
# Open wav corresponding to transcription file
wav_filename = os.path.splitext(os.path.basename(trans_file))[0] + ".wav"
wav_file = os.path.join(target_dir, wav_filename)
wav_file = os.path.join(source_dir, wav_filename)
print("splitting {} according to {}".format(wav_file, trans_file))
@ -212,7 +223,7 @@ def _maybe_split_wav(data_dir, original_data, converted_data):
origAudio.close()
# Remove wav_file
os.remove(wav_file)
# os.remove(wav_file)
def _split_wav(origAudio, start_time, stop_time, new_wav_file):
frameRate = origAudio.getframerate()
@ -229,8 +240,8 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data):
source_dir = os.path.join(data_dir, original_data)
target_dir = os.path.join(data_dir, converted_data)
if os.path.exists(target_dir):
print("skipping split_transcriptions")
if os.path.exists(os.path.join(source_dir, "split_transcriptions_done")):
print("skipping maybe_split_transcriptions")
return
# Loop over transcription files and split them into individual files for
@ -247,19 +258,21 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data):
txt_filename = os.path.splitext(os.path.basename(trans_file))[0] + "-" + str(start_time) + "-" + str(stop_time) + ".txt"
txt_file = os.path.join(target_dir, txt_filename)
# If the txt segment filename does not exist create it
if not os.path.exists(txt_file):
transcript = validate_label(segment["transcript"])
# If the transcript is valid and the txt segment filename does
# not exist create it
if transcript != None and not os.path.exists(txt_file):
with open(txt_file, "w") as fout:
fout.write(segment["transcript"])
fout.write(transcript)
with open(os.path.join(source_dir, "split_transcriptions_done"), "w") as fout:
fout.write("This file signals to the importer than the transcription of this source dir has already been completed.")
def _maybe_split_sets(data_dir, original_data, converted_data):
source_dir = os.path.join(data_dir, original_data)
target_dir = os.path.join(data_dir, converted_data)
if os.path.exists(target_dir):
print("skipping split_sets")
return
filelist = sorted(glob(os.path.join(source_dir, "*.txt")))
# We initially split the entire set into 80% train and 20% test, then