DeepSpeech/bin/import_swb.py

#!/usr/bin/env python
from __future__ import absolute_import, division, print_function

# Make sure we can import stuff from util/
# This script needs to be run from the root of the DeepSpeech repository

# ensure that you have downloaded the LDC dataset LDC97S62 and tar exists in a folder e.g.
# ./data/swb/swb1_LDC97S62.tgz
# from the deepspeech directory run with: ./bin/import_swb.py ./data/swb/

import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))

import fnmatch
import pandas
import subprocess
import unicodedata
import wave
import codecs
import tarfile
import requests
from util.text import validate_label
import librosa
import soundfile # <= Has an external dependency on libsndfile

# ARCHIVE_NAME refers to ISIP alignments from 01/29/03
ARCHIVE_NAME = 'switchboard_word_alignments.tar.gz'
ARCHIVE_URL = 'http://www.openslr.org/resources/5/'
ARCHIVE_DIR_NAME = 'LDC97S62'
LDC_DATASET = 'swb1_LDC97S62.tgz'


def download_file(folder, url):
    # https://stackoverflow.com/a/16696317/738515
    local_filename = url.split('/')[-1]
    full_filename = os.path.join(folder, local_filename)
    r = requests.get(url, stream=True)
    with open(full_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
    return full_filename


def maybe_download(archive_url, target_dir, ldc_dataset):
    # If archive file does not exist, download it...
    archive_path = os.path.join(target_dir, ldc_dataset)
    ldc_path = archive_url+ldc_dataset
    if not os.path.exists(target_dir):
        print('No path "%s" - creating ...' % target_dir)
        makedirs(target_dir)

    if not os.path.exists(archive_path):
        print('No archive "%s" - downloading...' % archive_path)
        download_file(target_dir, ldc_path)
    else:
        print('Found archive "%s" - not downloading.' % archive_path)
    return archive_path


def _download_and_preprocess_data(data_dir):
    new_data_dir = os.path.join(data_dir, ARCHIVE_DIR_NAME)
    target_dir = os.path.abspath(new_data_dir)
    archive_path = os.path.abspath(os.path.join(data_dir, LDC_DATASET))

    # Check swb1_LDC97S62.tgz then extract
    assert(os.path.isfile(archive_path))
    _extract(target_dir, archive_path)

    # Transcripts
    transcripts_path = maybe_download(ARCHIVE_URL, target_dir, ARCHIVE_NAME)
    _extract(target_dir, transcripts_path)

    # Check swb1_d1/2/3/4/swb_ms98_transcriptions
    expected_folders = ["swb1_d1","swb1_d2","swb1_d3","swb1_d4","swb_ms98_transcriptions"]
    assert(all([os.path.isdir(os.path.join(target_dir,e)) for e in expected_folders]))

    # Conditionally convert swb sph data to wav
    _maybe_convert_wav(target_dir, "swb1_d1", "swb1_d1-wav")
    _maybe_convert_wav(target_dir, "swb1_d2", "swb1_d2-wav")
    _maybe_convert_wav(target_dir, "swb1_d3", "swb1_d3-wav")
    _maybe_convert_wav(target_dir, "swb1_d4", "swb1_d4-wav")

    # Conditionally split wav data
    d1 = _maybe_split_wav_and_sentences(target_dir, "swb_ms98_transcriptions", "swb1_d1-wav", "swb1_d1-split-wav")
    d2 = _maybe_split_wav_and_sentences(target_dir, "swb_ms98_transcriptions", "swb1_d2-wav", "swb1_d2-split-wav")
    d3 = _maybe_split_wav_and_sentences(target_dir, "swb_ms98_transcriptions", "swb1_d3-wav", "swb1_d3-split-wav")
    d4 = _maybe_split_wav_and_sentences(target_dir, "swb_ms98_transcriptions", "swb1_d4-wav", "swb1_d4-split-wav")

    swb_files = d1.append(d2).append(d3).append(d4)

    train_files, dev_files, test_files = _split_sets(swb_files)

    # Write sets to disk as CSV files
    train_files.to_csv(os.path.join(target_dir, "swb-train.csv"), index=False)
    dev_files.to_csv(os.path.join(target_dir, "swb-dev.csv"), index=False)
    test_files.to_csv(os.path.join(target_dir, "swb-test.csv"), index=False)


def _extract(target_dir, archive_path):
    with tarfile.open(archive_path) as tar:
        tar.extractall(target_dir)


def _maybe_convert_wav(data_dir, original_data, converted_data):
    source_dir = os.path.join(data_dir, original_data)
    target_dir = os.path.join(data_dir, converted_data)

    # Conditionally convert sph files to wav files
    if os.path.exists(target_dir):
        print("skipping maybe_convert_wav")
        return

    # Create target_dir
    os.makedirs(target_dir)

    # Loop over sph files in source_dir and convert each to 16-bit PCM wav
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.sph"):
            for channel in ['1', '2']:
                sph_file = os.path.join(root, filename)
                wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + "-" + channel + ".wav"
                wav_file = os.path.join(target_dir, wav_filename)
                temp_wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + "-" + channel + "-temp.wav"
                temp_wav_file = os.path.join(target_dir, temp_wav_filename)
                print("converting {} to {}".format(sph_file, temp_wav_file))
                subprocess.check_call(["sph2pipe", "-c", channel, "-p", "-f", "rif", sph_file, temp_wav_file])
                print("upsampling {} to {}".format(temp_wav_file, wav_file))
                audioData, frameRate = librosa.load(temp_wav_file, sr=16000, mono=True)
                soundfile.write(wav_file, audioData, frameRate, "PCM_16")
                os.remove(temp_wav_file)


def _parse_transcriptions(trans_file):
    segments = []
    with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
            if line.startswith("#")  or len(line) <= 1:
                continue

            tokens = line.split()
            start_time = float(tokens[1])
            stop_time = float(tokens[2])
            transcript = validate_label(" ".join(tokens[3:]))

            if transcript == None:
                continue

            # We need to do the encode-decode dance here because encode
            # returns a bytes() object on Python 3, and text_to_char_array
            # expects a string.
            transcript = unicodedata.normalize("NFKD", transcript)  \
                                    .encode("ascii", "ignore")      \
                                    .decode("ascii", "ignore")

            segments.append({
                "start_time": start_time,
                "stop_time": stop_time,
                "transcript": transcript,
            })
    return segments


def _maybe_split_wav_and_sentences(data_dir, trans_data, original_data, converted_data):
    trans_dir = os.path.join(data_dir, trans_data)
    source_dir = os.path.join(data_dir, original_data)
    target_dir = os.path.join(data_dir, converted_data)
    if os.path.exists(target_dir):
        print("skipping maybe_split_wav")
        return

    os.makedirs(target_dir)

    files = []

    # Loop over transcription files and split corresponding wav
    for root, dirnames, filenames in os.walk(trans_dir):
        for filename in fnmatch.filter(filenames, "*.text"):
            if "trans" not in filename:
                continue
            trans_file = os.path.join(root, filename)
            segments = _parse_transcriptions(trans_file)

            # Open wav corresponding to transcription file
            channel = ("2","1")[(os.path.splitext(os.path.basename(trans_file))[0])[6] == 'A']
            wav_filename = "sw0" + (os.path.splitext(os.path.basename(trans_file))[0])[2:6] + "-" + channel + ".wav"
            wav_file = os.path.join(source_dir, wav_filename)

            print("splitting {} according to {}".format(wav_file, trans_file))

            if not os.path.exists(wav_file):
                print("skipping. does not exist:" + wav_file)
                continue

            origAudio = wave.open(wav_file, "r")

            # Loop over segments and split wav_file for each segment
            for segment in segments:
                # Create wav segment filename
                start_time = segment["start_time"]
                stop_time = segment["stop_time"]
                new_wav_filename = os.path.splitext(os.path.basename(trans_file))[0] + "-" + str(
                    start_time) + "-" + str(stop_time) + ".wav"
                if _is_wav_too_short(new_wav_filename):
                  continue
                new_wav_file = os.path.join(target_dir, new_wav_filename)

                _split_wav(origAudio, start_time, stop_time, new_wav_file)

                new_wav_filesize = os.path.getsize(new_wav_file)
                transcript = segment["transcript"]
                files.append((os.path.abspath(new_wav_file), new_wav_filesize, transcript))

            # Close origAudio
            origAudio.close()

    return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])


def _is_wav_too_short(wav_filename):
    short_wav_filenames = ['sw2986A-ms98-a-trans-80.6385-83.358875.wav', 'sw2663A-ms98-a-trans-161.12025-164.213375.wav']
    return wav_filename in short_wav_filenames


def _split_wav(origAudio, start_time, stop_time, new_wav_file):
    frameRate = origAudio.getframerate()
    origAudio.setpos(int(start_time * frameRate))
    chunkData = origAudio.readframes(int((stop_time - start_time) * frameRate))
    chunkAudio = wave.open(new_wav_file, "w")
    chunkAudio.setnchannels(origAudio.getnchannels())
    chunkAudio.setsampwidth(origAudio.getsampwidth())
    chunkAudio.setframerate(frameRate)
    chunkAudio.writeframes(chunkData)
    chunkAudio.close()


def _split_sets(filelist):
    # We initially split the entire set into 80% train and 20% test, then
    # split the train set into 80% train and 20% validation.
    train_beg = 0
    train_end = int(0.8 * len(filelist))

    dev_beg = int(0.8 * train_end)
    dev_end = train_end
    train_end = dev_beg

    test_beg = dev_end
    test_end = len(filelist)

    return (filelist[train_beg:train_end], filelist[dev_beg:dev_end], filelist[test_beg:test_end])


def _read_data_set(filelist, thread_count, batch_size, numcep, numcontext, stride=1, offset=0, next_index=lambda i: i + 1, limit=0):
    # Optionally apply dataset size limit
    if limit > 0:
        filelist = filelist.iloc[:limit]

    filelist = filelist[offset::stride]

    # Return DataSet
    return DataSet(txt_files, thread_count, batch_size, numcep, numcontext, next_index=next_index)


if __name__ == "__main__":
    _download_and_preprocess_data(sys.argv[1])