mirror of
https://github.com/mozilla/DeepSpeech.git
synced 2025-10-26 11:19:39 +00:00
Address review comments and do further filtering and cleanup on the transcription data
This commit is contained in:
parent
c7eaf9939b
commit
d8aaffce62
@ -13,7 +13,7 @@ from Queue import Queue
|
||||
from threading import Thread
|
||||
from util.audio import audiofile_to_input_vector
|
||||
from util.gpu import get_available_gpus
|
||||
from util.text import texts_to_sparse_tensor
|
||||
from util.text import texts_to_sparse_tensor, validate_label
|
||||
|
||||
class DataSets(object):
|
||||
def __init__(self, train, dev, test):
|
||||
@ -69,25 +69,30 @@ class DataSet(object):
|
||||
|
||||
def _populate_batch_queue(self):
|
||||
with self._graph.as_default():
|
||||
while True:
|
||||
n_steps = 0
|
||||
sources = []
|
||||
targets = []
|
||||
for index, (txt_file, wav_file) in enumerate(self._files_circular_list):
|
||||
if index >= self._batch_size:
|
||||
break
|
||||
next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
|
||||
if n_steps < next_source.shape[0]:
|
||||
n_steps = next_source.shape[0]
|
||||
sources.append(next_source)
|
||||
with open(txt_file) as open_txt_file:
|
||||
targets.append(open_txt_file.read())
|
||||
target = texts_to_sparse_tensor(targets)
|
||||
for index, next_source in enumerate(sources):
|
||||
npad = ((0,(n_steps - next_source.shape[0])), (0,0))
|
||||
sources[index] = np.pad(next_source, pad_width=npad, mode="constant")
|
||||
source = np.array(sources)
|
||||
self._batch_queue.put((source, target))
|
||||
n_steps = 0
|
||||
sources = []
|
||||
targets = []
|
||||
batch_index = 0
|
||||
for txt_file, wav_file in self._files_circular_list:
|
||||
if batch_index == self._batch_size:
|
||||
# Put batch on queue
|
||||
target = texts_to_sparse_tensor(targets)
|
||||
for index, next_source in enumerate(sources):
|
||||
npad = ((0,(n_steps - next_source.shape[0])), (0,0))
|
||||
sources[index] = np.pad(next_source, pad_width=npad, mode='constant')
|
||||
source = np.array(sources)
|
||||
self._batch_queue.put((source, target))
|
||||
n_steps = 0
|
||||
sources = []
|
||||
targets = []
|
||||
batch_index = 0
|
||||
next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
|
||||
if n_steps < next_source.shape[0]:
|
||||
n_steps = next_source.shape[0]
|
||||
sources.append(next_source)
|
||||
with open(txt_file) as open_txt_file:
|
||||
targets.append(open_txt_file.read())
|
||||
batch_index = batch_index + 1
|
||||
|
||||
def next_batch(self):
|
||||
source, target = self._batch_queue.get()
|
||||
@ -106,25 +111,25 @@ def read_data_sets(graph, data_dir, batch_size, numcep, numcontext, thread_count
|
||||
_maybe_convert_wav(data_dir, "LDC2005S13", "fisher-2005-wav")
|
||||
|
||||
# Conditionally split Fisher wav data
|
||||
_maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav")
|
||||
_maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav")
|
||||
_maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav", "fisher-2004-split-wav")
|
||||
_maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav", "fisher-2005-split-wav")
|
||||
|
||||
# Conditionally split Fisher transcriptions
|
||||
_maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav")
|
||||
_maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav")
|
||||
_maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-split-wav")
|
||||
_maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-split-wav")
|
||||
|
||||
# Conditionally split Fisher data into train/validation/test sets
|
||||
_maybe_split_sets(data_dir, "fisher-2004-wav", "fisher-2004-wav-splits")
|
||||
_maybe_split_sets(data_dir, "fisher-2005-wav", "fisher-2005-wav-splits")
|
||||
|
||||
_maybe_split_sets(data_dir, "fisher-2004-split-wav", "fisher-2004-split-wav-sets")
|
||||
_maybe_split_sets(data_dir, "fisher-2005-split-wav", "fisher-2005-split-wav-sets")
|
||||
|
||||
# Create train DataSet
|
||||
train = _read_data_set(graph, data_dir, "fisher-200?-wav/train", thread_count, batch_size, numcep, numcontext)
|
||||
train = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/train", thread_count, batch_size, numcep, numcontext)
|
||||
|
||||
# Create dev DataSet
|
||||
dev = _read_data_set(graph, data_dir, "fisher-200?-wav/dev", thread_count, batch_size, numcep, numcontext)
|
||||
dev = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/dev", thread_count, batch_size, numcep, numcontext)
|
||||
|
||||
# Create test DataSet
|
||||
test = _read_data_set(graph, data_dir, "fisher-200?-wav/test", thread_count, batch_size, numcep, numcontext)
|
||||
test = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/test", thread_count, batch_size, numcep, numcontext)
|
||||
|
||||
# Return DataSets
|
||||
return DataSets(train, dev, test)
|
||||
@ -134,18 +139,21 @@ def _maybe_convert_wav(data_dir, original_data, converted_data):
|
||||
target_dir = os.path.join(data_dir, converted_data)
|
||||
|
||||
# Conditionally convert sph files to wav files
|
||||
if not os.path.exists(target_dir):
|
||||
# Create target_dir
|
||||
os.makedirs(target_dir)
|
||||
|
||||
# Loop over sph files in source_dir and convert each to 16-bit PCM wav
|
||||
for root, dirnames, filenames in os.walk(source_dir):
|
||||
for filename in fnmatch.filter(filenames, "*.sph"):
|
||||
sph_file = os.path.join(root, filename)
|
||||
wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav"
|
||||
wav_file = os.path.join(target_dir, wav_filename)
|
||||
print("converting {} to {}".format(sph_file, wav_file))
|
||||
subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file])
|
||||
if os.path.exists(target_dir):
|
||||
print("skipping maybe_convert_wav")
|
||||
return
|
||||
|
||||
# Create target_dir
|
||||
os.makedirs(target_dir)
|
||||
|
||||
# Loop over sph files in source_dir and convert each to 16-bit PCM wav
|
||||
for root, dirnames, filenames in os.walk(source_dir):
|
||||
for filename in fnmatch.filter(filenames, "*.sph"):
|
||||
sph_file = os.path.join(root, filename)
|
||||
wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav"
|
||||
wav_file = os.path.join(target_dir, wav_filename)
|
||||
print("converting {} to {}".format(sph_file, wav_file))
|
||||
subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file])
|
||||
|
||||
def _parse_transcriptions(trans_file):
|
||||
segments = []
|
||||
@ -174,23 +182,26 @@ def _parse_transcriptions(trans_file):
|
||||
})
|
||||
return segments
|
||||
|
||||
def _maybe_split_wav(data_dir, original_data, converted_data):
|
||||
def _maybe_split_wav(data_dir, trans_data, original_data, converted_data):
|
||||
trans_dir = os.path.join(data_dir, trans_data)
|
||||
source_dir = os.path.join(data_dir, original_data)
|
||||
target_dir = os.path.join(data_dir, converted_data)
|
||||
|
||||
if os.path.exists(target_dir):
|
||||
print("skipping split_wav")
|
||||
print("skipping maybe_split_wav")
|
||||
return
|
||||
|
||||
os.makedirs(target_dir)
|
||||
|
||||
# Loop over transcription files and split corresponding wav
|
||||
for root, dirnames, filenames in os.walk(source_dir):
|
||||
for root, dirnames, filenames in os.walk(trans_dir):
|
||||
for filename in fnmatch.filter(filenames, "*.txt"):
|
||||
trans_file = os.path.join(root, filename)
|
||||
segments = _parse_transcriptions(trans_file)
|
||||
|
||||
# Open wav corresponding to transcription file
|
||||
wav_filename = os.path.splitext(os.path.basename(trans_file))[0] + ".wav"
|
||||
wav_file = os.path.join(target_dir, wav_filename)
|
||||
wav_file = os.path.join(source_dir, wav_filename)
|
||||
|
||||
print("splitting {} according to {}".format(wav_file, trans_file))
|
||||
|
||||
@ -212,7 +223,7 @@ def _maybe_split_wav(data_dir, original_data, converted_data):
|
||||
origAudio.close()
|
||||
|
||||
# Remove wav_file
|
||||
os.remove(wav_file)
|
||||
# os.remove(wav_file)
|
||||
|
||||
def _split_wav(origAudio, start_time, stop_time, new_wav_file):
|
||||
frameRate = origAudio.getframerate()
|
||||
@ -229,8 +240,8 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data):
|
||||
source_dir = os.path.join(data_dir, original_data)
|
||||
target_dir = os.path.join(data_dir, converted_data)
|
||||
|
||||
if os.path.exists(target_dir):
|
||||
print("skipping split_transcriptions")
|
||||
if os.path.exists(os.path.join(source_dir, "split_transcriptions_done")):
|
||||
print("skipping maybe_split_transcriptions")
|
||||
return
|
||||
|
||||
# Loop over transcription files and split them into individual files for
|
||||
@ -247,19 +258,21 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data):
|
||||
txt_filename = os.path.splitext(os.path.basename(trans_file))[0] + "-" + str(start_time) + "-" + str(stop_time) + ".txt"
|
||||
txt_file = os.path.join(target_dir, txt_filename)
|
||||
|
||||
# If the txt segment filename does not exist create it
|
||||
if not os.path.exists(txt_file):
|
||||
transcript = validate_label(segment["transcript"])
|
||||
|
||||
# If the transcript is valid and the txt segment filename does
|
||||
# not exist create it
|
||||
if transcript != None and not os.path.exists(txt_file):
|
||||
with open(txt_file, "w") as fout:
|
||||
fout.write(segment["transcript"])
|
||||
|
||||
fout.write(transcript)
|
||||
|
||||
with open(os.path.join(source_dir, "split_transcriptions_done"), "w") as fout:
|
||||
fout.write("This file signals to the importer than the transcription of this source dir has already been completed.")
|
||||
|
||||
def _maybe_split_sets(data_dir, original_data, converted_data):
|
||||
source_dir = os.path.join(data_dir, original_data)
|
||||
target_dir = os.path.join(data_dir, converted_data)
|
||||
|
||||
if os.path.exists(target_dir):
|
||||
print("skipping split_sets")
|
||||
return
|
||||
|
||||
filelist = sorted(glob(os.path.join(source_dir, "*.txt")))
|
||||
|
||||
# We initially split the entire set into 80% train and 20% test, then
|
||||
|
||||
Loading…
Reference in New Issue
Block a user