Address review comments and do further filtering and cleanup on the transcription data

2025-10-26 11:19:39 +00:00 · 2016-10-27 16:08:14 -02:00 · 2016-10-27 16:08:14 -02:00 · d8aaffce62
commit d8aaffce62
parent c7eaf9939b
1 changed files with 70 additions and 57 deletions
--- a/util/importers/fisher.py
+++ b/util/importers/fisher.py
@ -13,7 +13,7 @@ from Queue import Queue
 from threading import Thread
 from util.audio import audiofile_to_input_vector
 from util.gpu import get_available_gpus
-from util.text import texts_to_sparse_tensor
+from util.text import texts_to_sparse_tensor, validate_label

 class DataSets(object):
    def __init__(self, train, dev, test):
@ -69,25 +69,30 @@ class DataSet(object):
    
    def _populate_batch_queue(self):
        with self._graph.as_default():
-            while True:
-                n_steps = 0
-                sources = []
-                targets = []
-                for index, (txt_file, wav_file) in enumerate(self._files_circular_list):
-                    if index >= self._batch_size:
-                        break
-                    next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
-                    if n_steps < next_source.shape[0]:
-                        n_steps = next_source.shape[0]
-                    sources.append(next_source)
-                    with open(txt_file) as open_txt_file:
-                        targets.append(open_txt_file.read())
-                target = texts_to_sparse_tensor(targets)
-                for index, next_source in enumerate(sources):
-                    npad = ((0,(n_steps - next_source.shape[0])), (0,0))
-                    sources[index] = np.pad(next_source, pad_width=npad, mode="constant")
-                source = np.array(sources)
-                self._batch_queue.put((source, target))
+            n_steps = 0
+            sources = []
+            targets = []
+            batch_index = 0
+            for txt_file, wav_file in self._files_circular_list:
+                if batch_index == self._batch_size:
+                    # Put batch on queue
+                    target = texts_to_sparse_tensor(targets)
+                    for index, next_source in enumerate(sources):
+                        npad = ((0,(n_steps - next_source.shape[0])), (0,0))
+                        sources[index] = np.pad(next_source, pad_width=npad, mode='constant')
+                    source = np.array(sources)
+                    self._batch_queue.put((source, target))
+                    n_steps = 0
+                    sources = []
+                    targets = []
+                    batch_index = 0
+                next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
+                if n_steps < next_source.shape[0]:
+                    n_steps = next_source.shape[0]
+                sources.append(next_source)
+                with open(txt_file) as open_txt_file:
+                    targets.append(open_txt_file.read())
+                batch_index = batch_index + 1
    
    def next_batch(self):
        source, target = self._batch_queue.get()
@ -106,25 +111,25 @@ def read_data_sets(graph, data_dir, batch_size, numcep, numcontext, thread_count
    _maybe_convert_wav(data_dir, "LDC2005S13", "fisher-2005-wav")

    # Conditionally split Fisher wav data
-    _maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav")
-    _maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav")
+    _maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav", "fisher-2004-split-wav")
+    _maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav", "fisher-2005-split-wav")

    # Conditionally split Fisher transcriptions
-    _maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav")
-    _maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav")
+    _maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-split-wav")
+    _maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-split-wav")
    
    # Conditionally split Fisher data into train/validation/test sets
-    _maybe_split_sets(data_dir, "fisher-2004-wav", "fisher-2004-wav-splits")
-    _maybe_split_sets(data_dir, "fisher-2005-wav", "fisher-2005-wav-splits")
-
+    _maybe_split_sets(data_dir, "fisher-2004-split-wav", "fisher-2004-split-wav-sets")
+    _maybe_split_sets(data_dir, "fisher-2005-split-wav", "fisher-2005-split-wav-sets")
+    
    # Create train DataSet
-    train = _read_data_set(graph, data_dir, "fisher-200?-wav/train", thread_count, batch_size, numcep, numcontext)
+    train = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/train", thread_count, batch_size, numcep, numcontext)

    # Create dev DataSet
-    dev = _read_data_set(graph, data_dir, "fisher-200?-wav/dev", thread_count, batch_size, numcep, numcontext)
+    dev = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/dev", thread_count, batch_size, numcep, numcontext)

    # Create test DataSet
-    test = _read_data_set(graph, data_dir, "fisher-200?-wav/test", thread_count, batch_size, numcep, numcontext)
+    test = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/test", thread_count, batch_size, numcep, numcontext)

    # Return DataSets
    return DataSets(train, dev, test)
@ -134,18 +139,21 @@ def _maybe_convert_wav(data_dir, original_data, converted_data):
    target_dir = os.path.join(data_dir, converted_data)
    
    # Conditionally convert sph files to wav files
-    if not os.path.exists(target_dir):
-        # Create target_dir
-        os.makedirs(target_dir)
-        
-        # Loop over sph files in source_dir and convert each to 16-bit PCM wav
-        for root, dirnames, filenames in os.walk(source_dir):
-            for filename in fnmatch.filter(filenames, "*.sph"):
-                sph_file = os.path.join(root, filename)
-                wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav"
-                wav_file = os.path.join(target_dir, wav_filename)
-                print("converting {} to {}".format(sph_file, wav_file))
-                subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file])
+    if os.path.exists(target_dir):
+        print("skipping maybe_convert_wav")
+        return
+    
+    # Create target_dir
+    os.makedirs(target_dir)
+    
+    # Loop over sph files in source_dir and convert each to 16-bit PCM wav
+    for root, dirnames, filenames in os.walk(source_dir):
+        for filename in fnmatch.filter(filenames, "*.sph"):
+            sph_file = os.path.join(root, filename)
+            wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav"
+            wav_file = os.path.join(target_dir, wav_filename)
+            print("converting {} to {}".format(sph_file, wav_file))
+            subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file])

 def _parse_transcriptions(trans_file):
    segments = []
@ -174,23 +182,26 @@ def _parse_transcriptions(trans_file):
            })
    return segments

-def _maybe_split_wav(data_dir, original_data, converted_data):
+def _maybe_split_wav(data_dir, trans_data, original_data, converted_data):
+    trans_dir = os.path.join(data_dir, trans_data)
    source_dir = os.path.join(data_dir, original_data)
    target_dir = os.path.join(data_dir, converted_data)
    
    if os.path.exists(target_dir):
-        print("skipping split_wav")
+        print("skipping maybe_split_wav")
        return
    
+    os.makedirs(target_dir)
+    
    # Loop over transcription files and split corresponding wav
-    for root, dirnames, filenames in os.walk(source_dir):
+    for root, dirnames, filenames in os.walk(trans_dir):
        for filename in fnmatch.filter(filenames, "*.txt"):
            trans_file = os.path.join(root, filename)
            segments = _parse_transcriptions(trans_file)
            
            # Open wav corresponding to transcription file
            wav_filename = os.path.splitext(os.path.basename(trans_file))[0] + ".wav"
-            wav_file = os.path.join(target_dir, wav_filename)
+            wav_file = os.path.join(source_dir, wav_filename)
            
            print("splitting {} according to {}".format(wav_file, trans_file))
            
@ -212,7 +223,7 @@ def _maybe_split_wav(data_dir, original_data, converted_data):
            origAudio.close()
            
            # Remove wav_file
-            os.remove(wav_file)
+            # os.remove(wav_file)

 def _split_wav(origAudio, start_time, stop_time, new_wav_file):
    frameRate = origAudio.getframerate()
@ -229,8 +240,8 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data):
    source_dir = os.path.join(data_dir, original_data)
    target_dir = os.path.join(data_dir, converted_data)
    
-    if os.path.exists(target_dir):
-        print("skipping split_transcriptions")
+    if os.path.exists(os.path.join(source_dir, "split_transcriptions_done")):
+        print("skipping maybe_split_transcriptions")
        return
    
    # Loop over transcription files and split them into individual files for
@ -247,19 +258,21 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data):
                txt_filename = os.path.splitext(os.path.basename(trans_file))[0] + "-" + str(start_time) + "-" + str(stop_time) + ".txt"
                txt_file = os.path.join(target_dir, txt_filename)
                
-                # If the txt segment filename does not exist create it
-                if not os.path.exists(txt_file):
+                transcript = validate_label(segment["transcript"])
+                
+                # If the transcript is valid and the txt segment file does not
+                # already exist, create it
+                if transcript != None and not os.path.exists(txt_file):
                    with open(txt_file, "w") as fout:
-                        fout.write(segment["transcript"])
-            
+                        fout.write(transcript)
+    
+    with open(os.path.join(source_dir, "split_transcriptions_done"), "w") as fout:
+        fout.write("This file signals to the importer than the transcription of this source dir has already been completed.")
+    
 def _maybe_split_sets(data_dir, original_data, converted_data):
    source_dir = os.path.join(data_dir, original_data)
    target_dir = os.path.join(data_dir, converted_data)
    
-    if os.path.exists(target_dir):
-        print("skipping split_sets")
-        return
-    
    filelist = sorted(glob(os.path.join(source_dir, "*.txt")))
    
    # We initially split the entire set into 80% train and 20% test, then