mirror of
https://github.com/mozilla/DeepSpeech.git
synced 2025-10-26 11:19:39 +00:00
154 lines
5.5 KiB
Python
Executable File
154 lines
5.5 KiB
Python
Executable File
#!/usr/bin/env python
|
|
from __future__ import absolute_import, division, print_function
|
|
|
|
# Make sure we can import stuff from util/
|
|
# This script needs to be run from the root of the DeepSpeech repository
|
|
import sys
|
|
import os
|
|
sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
|
|
|
import codecs
|
|
import pandas
|
|
import tarfile
|
|
import unicodedata
|
|
import wave
|
|
|
|
from glob import glob
|
|
from os import makedirs, path, remove, rmdir
|
|
from sox import Transformer
|
|
from util.downloader import maybe_download
|
|
from tensorflow.python.platform import gfile
|
|
from util.stm import parse_stm_file
|
|
|
|
def _download_and_preprocess_data(data_dir):
|
|
# Conditionally download data
|
|
TED_DATA = "TEDLIUM_release2.tar.gz"
|
|
TED_DATA_URL = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
|
|
local_file = maybe_download(TED_DATA, data_dir, TED_DATA_URL)
|
|
|
|
# Conditionally extract TED data
|
|
TED_DIR = "TEDLIUM_release2"
|
|
_maybe_extract(data_dir, TED_DIR, local_file)
|
|
|
|
# Conditionally convert TED sph data to wav
|
|
_maybe_convert_wav(data_dir, TED_DIR)
|
|
|
|
# Conditionally split TED wav and text data into sentences
|
|
train_files, dev_files, test_files = _maybe_split_sentences(data_dir, TED_DIR)
|
|
|
|
# Write sets to disk as CSV files
|
|
train_files.to_csv(path.join(data_dir, "ted-train.csv"), index=False)
|
|
dev_files.to_csv(path.join(data_dir, "ted-dev.csv"), index=False)
|
|
test_files.to_csv(path.join(data_dir, "ted-test.csv"), index=False)
|
|
|
|
def _maybe_extract(data_dir, extracted_data, archive):
|
|
# If data_dir/extracted_data does not exist, extract archive in data_dir
|
|
if not gfile.Exists(path.join(data_dir, extracted_data)):
|
|
tar = tarfile.open(archive)
|
|
tar.extractall(data_dir)
|
|
tar.close()
|
|
|
|
def _maybe_convert_wav(data_dir, extracted_data):
|
|
# Create extracted_data dir
|
|
extracted_dir = path.join(data_dir, extracted_data)
|
|
|
|
# Conditionally convert dev sph to wav
|
|
_maybe_convert_wav_dataset(extracted_dir, "dev")
|
|
|
|
# Conditionally convert train sph to wav
|
|
_maybe_convert_wav_dataset(extracted_dir, "train")
|
|
|
|
# Conditionally convert test sph to wav
|
|
_maybe_convert_wav_dataset(extracted_dir, "test")
|
|
|
|
def _maybe_convert_wav_dataset(extracted_dir, data_set):
|
|
# Create source dir
|
|
source_dir = path.join(extracted_dir, data_set, "sph")
|
|
|
|
# Create target dir
|
|
target_dir = path.join(extracted_dir, data_set, "wav")
|
|
|
|
# Conditionally convert sph files to wav files
|
|
if not gfile.Exists(target_dir):
|
|
# Create target_dir
|
|
makedirs(target_dir)
|
|
|
|
# Loop over sph files in source_dir and convert each to wav
|
|
for sph_file in glob(path.join(source_dir, "*.sph")):
|
|
transformer = Transformer()
|
|
wav_filename = path.splitext(path.basename(sph_file))[0] + ".wav"
|
|
wav_file = path.join(target_dir, wav_filename)
|
|
transformer.build(sph_file, wav_file)
|
|
remove(sph_file)
|
|
|
|
# Remove source_dir
|
|
rmdir(source_dir)
|
|
|
|
def _maybe_split_sentences(data_dir, extracted_data):
|
|
# Create extracted_data dir
|
|
extracted_dir = path.join(data_dir, extracted_data)
|
|
|
|
# Conditionally split dev wav
|
|
dev_files = _maybe_split_dataset(extracted_dir, "dev")
|
|
|
|
# Conditionally split train wav
|
|
train_files = _maybe_split_dataset(extracted_dir, "train")
|
|
|
|
# Conditionally split test wav
|
|
test_files = _maybe_split_dataset(extracted_dir, "test")
|
|
|
|
return train_files, dev_files, test_files
|
|
|
|
def _maybe_split_dataset(extracted_dir, data_set):
|
|
# Create stm dir
|
|
stm_dir = path.join(extracted_dir, data_set, "stm")
|
|
|
|
# Create wav dir
|
|
wav_dir = path.join(extracted_dir, data_set, "wav")
|
|
|
|
files = []
|
|
|
|
# Loop over stm files and split corresponding wav
|
|
for stm_file in glob(path.join(stm_dir, "*.stm")):
|
|
# Parse stm file
|
|
stm_segments = parse_stm_file(stm_file)
|
|
|
|
# Open wav corresponding to stm_file
|
|
wav_filename = path.splitext(path.basename(stm_file))[0] + ".wav"
|
|
wav_file = path.join(wav_dir, wav_filename)
|
|
origAudio = wave.open(wav_file,'r')
|
|
|
|
# Loop over stm_segments and split wav_file for each segment
|
|
for stm_segment in stm_segments:
|
|
# Create wav segment filename
|
|
start_time = stm_segment.start_time
|
|
stop_time = stm_segment.stop_time
|
|
new_wav_filename = path.splitext(path.basename(stm_file))[0] + "-" + str(start_time) + "-" + str(stop_time) + ".wav"
|
|
new_wav_file = path.join(wav_dir, new_wav_filename)
|
|
|
|
# If the wav segment filename does not exist create it
|
|
if not gfile.Exists(new_wav_file):
|
|
_split_wav(origAudio, start_time, stop_time, new_wav_file)
|
|
|
|
new_wav_filesize = path.getsize(new_wav_file)
|
|
files.append((path.abspath(new_wav_file), new_wav_filesize, stm_segment.transcript))
|
|
|
|
# Close origAudio
|
|
origAudio.close()
|
|
|
|
return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
|
|
|
|
def _split_wav(origAudio, start_time, stop_time, new_wav_file):
|
|
frameRate = origAudio.getframerate()
|
|
origAudio.setpos(int(start_time*frameRate))
|
|
chunkData = origAudio.readframes(int((stop_time - start_time)*frameRate))
|
|
chunkAudio = wave.open(new_wav_file,'w')
|
|
chunkAudio.setnchannels(origAudio.getnchannels())
|
|
chunkAudio.setsampwidth(origAudio.getsampwidth())
|
|
chunkAudio.setframerate(frameRate)
|
|
chunkAudio.writeframes(chunkData)
|
|
chunkAudio.close()
|
|
|
|
if __name__ == "__main__":
|
|
_download_and_preprocess_data(sys.argv[1])
|