From d8aaffce620c35b55c65e840ddb3a67f8c2fbd68 Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Thu, 27 Oct 2016 16:08:14 -0200
Subject: [PATCH] Address review comments and do further filtering and cleanup
 on the transcription data

---
 util/importers/fisher.py | 127 +++++++++++++++++++++------------------
 1 file changed, 70 insertions(+), 57 deletions(-)

diff --git a/util/importers/fisher.py b/util/importers/fisher.py
index 683919c3..8cc4e701 100644
--- a/util/importers/fisher.py
+++ b/util/importers/fisher.py
@@ -13,7 +13,7 @@ from Queue import Queue
 from threading import Thread
 from util.audio import audiofile_to_input_vector
 from util.gpu import get_available_gpus
-from util.text import texts_to_sparse_tensor
+from util.text import texts_to_sparse_tensor, validate_label
 
 class DataSets(object):
     def __init__(self, train, dev, test):
@@ -69,25 +69,30 @@ class DataSet(object):
     
     def _populate_batch_queue(self):
         with self._graph.as_default():
-            while True:
-                n_steps = 0
-                sources = []
-                targets = []
-                for index, (txt_file, wav_file) in enumerate(self._files_circular_list):
-                    if index >= self._batch_size:
-                        break
-                    next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
-                    if n_steps < next_source.shape[0]:
-                        n_steps = next_source.shape[0]
-                    sources.append(next_source)
-                    with open(txt_file) as open_txt_file:
-                        targets.append(open_txt_file.read())
-                target = texts_to_sparse_tensor(targets)
-                for index, next_source in enumerate(sources):
-                    npad = ((0,(n_steps - next_source.shape[0])), (0,0))
-                    sources[index] = np.pad(next_source, pad_width=npad, mode="constant")
-                source = np.array(sources)
-                self._batch_queue.put((source, target))
+            n_steps = 0
+            sources = []
+            targets = []
+            batch_index = 0
+            for txt_file, wav_file in self._files_circular_list:
+                if batch_index == self._batch_size:
+                    # Batch is full: pad sources to the longest one, enqueue it, then reset the accumulators
+                    target = texts_to_sparse_tensor(targets)
+                    for index, next_source in enumerate(sources):
+                        npad = ((0,(n_steps - next_source.shape[0])), (0,0))
+                        sources[index] = np.pad(next_source, pad_width=npad, mode='constant')
+                    source = np.array(sources)
+                    self._batch_queue.put((source, target))
+                    n_steps = 0
+                    sources = []
+                    targets = []
+                    batch_index = 0
+                next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
+                if n_steps < next_source.shape[0]:
+                    n_steps = next_source.shape[0]
+                sources.append(next_source)
+                with open(txt_file) as open_txt_file:
+                    targets.append(open_txt_file.read())
+                batch_index = batch_index + 1
     
     def next_batch(self):
         source, target = self._batch_queue.get()
@@ -106,25 +111,25 @@ def read_data_sets(graph, data_dir, batch_size, numcep, numcontext, thread_count
     _maybe_convert_wav(data_dir, "LDC2005S13", "fisher-2005-wav")
 
     # Conditionally split Fisher wav data
-    _maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav")
-    _maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav")
+    _maybe_split_wav(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav", "fisher-2004-split-wav")
+    _maybe_split_wav(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav", "fisher-2005-split-wav")
 
     # Conditionally split Fisher transcriptions
-    _maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-wav")
-    _maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-wav")
+    _maybe_split_transcriptions(data_dir, os.path.join("LDC2004T19", "fe_03_p1_tran", "data", "trans"), "fisher-2004-split-wav")
+    _maybe_split_transcriptions(data_dir, os.path.join("LDC2005T19", "fe_03_p2_tran", "data", "trans"), "fisher-2005-split-wav")
     
     # Conditionally split Fisher data into train/validation/test sets
-    _maybe_split_sets(data_dir, "fisher-2004-wav", "fisher-2004-wav-splits")
-    _maybe_split_sets(data_dir, "fisher-2005-wav", "fisher-2005-wav-splits")
-
+    _maybe_split_sets(data_dir, "fisher-2004-split-wav", "fisher-2004-split-wav-sets")
+    _maybe_split_sets(data_dir, "fisher-2005-split-wav", "fisher-2005-split-wav-sets")
+    
     # Create train DataSet
-    train = _read_data_set(graph, data_dir, "fisher-200?-wav/train", thread_count, batch_size, numcep, numcontext)
+    train = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/train", thread_count, batch_size, numcep, numcontext)
 
     # Create dev DataSet
-    dev = _read_data_set(graph, data_dir, "fisher-200?-wav/dev", thread_count, batch_size, numcep, numcontext)
+    dev = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/dev", thread_count, batch_size, numcep, numcontext)
 
     # Create test DataSet
-    test = _read_data_set(graph, data_dir, "fisher-200?-wav/test", thread_count, batch_size, numcep, numcontext)
+    test = _read_data_set(graph, data_dir, "fisher-200?-split-wav-sets/test", thread_count, batch_size, numcep, numcontext)
 
     # Return DataSets
     return DataSets(train, dev, test)
@@ -134,18 +139,21 @@ def _maybe_convert_wav(data_dir, original_data, converted_data):
     target_dir = os.path.join(data_dir, converted_data)
     
     # Conditionally convert sph files to wav files
-    if not os.path.exists(target_dir):
-        # Create target_dir
-        os.makedirs(target_dir)
-        
-        # Loop over sph files in source_dir and convert each to 16-bit PCM wav
-        for root, dirnames, filenames in os.walk(source_dir):
-            for filename in fnmatch.filter(filenames, "*.sph"):
-                sph_file = os.path.join(root, filename)
-                wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav"
-                wav_file = os.path.join(target_dir, wav_filename)
-                print("converting {} to {}".format(sph_file, wav_file))
-                subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file])
+    if os.path.exists(target_dir):
+        print("skipping maybe_convert_wav")
+        return
+    
+    # Create target_dir
+    os.makedirs(target_dir)
+    
+    # Loop over sph files in source_dir and convert each to 16-bit PCM wav
+    for root, dirnames, filenames in os.walk(source_dir):
+        for filename in fnmatch.filter(filenames, "*.sph"):
+            sph_file = os.path.join(root, filename)
+            wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + ".wav"
+            wav_file = os.path.join(target_dir, wav_filename)
+            print("converting {} to {}".format(sph_file, wav_file))
+            subprocess.check_call(["sph2pipe", "-p", "-f", "rif", sph_file, wav_file])
 
 def _parse_transcriptions(trans_file):
     segments = []
@@ -174,23 +182,26 @@ def _parse_transcriptions(trans_file):
             })
     return segments
 
-def _maybe_split_wav(data_dir, original_data, converted_data):
+def _maybe_split_wav(data_dir, trans_data, original_data, converted_data):
+    trans_dir = os.path.join(data_dir, trans_data)
     source_dir = os.path.join(data_dir, original_data)
     target_dir = os.path.join(data_dir, converted_data)
     
     if os.path.exists(target_dir):
-        print("skipping split_wav")
+        print("skipping maybe_split_wav")
         return
     
+    os.makedirs(target_dir)
+    
     # Loop over transcription files and split corresponding wav
-    for root, dirnames, filenames in os.walk(source_dir):
+    for root, dirnames, filenames in os.walk(trans_dir):
         for filename in fnmatch.filter(filenames, "*.txt"):
             trans_file = os.path.join(root, filename)
             segments = _parse_transcriptions(trans_file)
             
             # Open wav corresponding to transcription file
             wav_filename = os.path.splitext(os.path.basename(trans_file))[0] + ".wav"
-            wav_file = os.path.join(target_dir, wav_filename)
+            wav_file = os.path.join(source_dir, wav_filename)
             
             print("splitting {} according to {}".format(wav_file, trans_file))
             
@@ -212,7 +223,7 @@ def _maybe_split_wav(data_dir, original_data, converted_data):
             origAudio.close()
             
             # Remove wav_file
-            os.remove(wav_file)
+            # os.remove(wav_file) is disabled here, so the source wav in source_dir is left in place
 
 def _split_wav(origAudio, start_time, stop_time, new_wav_file):
     frameRate = origAudio.getframerate()
@@ -229,8 +240,8 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data):
     source_dir = os.path.join(data_dir, original_data)
     target_dir = os.path.join(data_dir, converted_data)
     
-    if os.path.exists(target_dir):
-        print("skipping split_transcriptions")
+    if os.path.exists(os.path.join(source_dir, "split_transcriptions_done")):
+        print("skipping maybe_split_transcriptions")
         return
     
     # Loop over transcription files and split them into individual files for
@@ -247,19 +258,21 @@ def _maybe_split_transcriptions(data_dir, original_data, converted_data):
                 txt_filename = os.path.splitext(os.path.basename(trans_file))[0] + "-" + str(start_time) + "-" + str(stop_time) + ".txt"
                 txt_file = os.path.join(target_dir, txt_filename)
                 
-                # If the txt segment filename does not exist create it
-                if not os.path.exists(txt_file):
+                transcript = validate_label(segment["transcript"])
+                
+                # Write the segment only when validate_label accepted the transcript
+                # (returned non-None) and the txt file does not already exist
+                if transcript is not None and not os.path.exists(txt_file):
                     with open(txt_file, "w") as fout:
-                        fout.write(segment["transcript"])
-            
+                        fout.write(transcript)
+    
+    with open(os.path.join(source_dir, "split_transcriptions_done"), "w") as fout:
+        fout.write("This file signals to the importer that the transcription of this source dir has already been completed.")
+    
 def _maybe_split_sets(data_dir, original_data, converted_data):
     source_dir = os.path.join(data_dir, original_data)
     target_dir = os.path.join(data_dir, converted_data)
     
-    if os.path.exists(target_dir):
-        print("skipping split_sets")
-        return
-    
     filelist = sorted(glob(os.path.join(source_dir, "*.txt")))
     
     # We initially split the entire set into 80% train and 20% test, then