#!/usr/bin/env python
# -*- coding: utf-8 -*-
# "Untitled" by CookiePPP (https://pastebin.com/u/CookiePPP)
# URL: https://pastebin.com/zueesbwR
# Created on: Sunday 5th of January 2020 01:01:14 AM CDT
# Retrieved on: Saturday 31 of October 2020 11:08:54 PM UTC
#
# Builds Tacotron2 filelist metadata (train/validation, plain and ARPAbet
# variants) from a directory tree of sliced dialogue clips whose filenames
# look like "<hh>_<mm>_<ss>_<voice>_<emotions>_<noise level>_<quote>.wav",
# each with a matching ".txt" transcript file alongside.

from pydub import AudioSegment
from tqdm import tqdm
import os
from random import shuffle
import utils.audio_converter as converter

# --- ARPAbet block -----------------------------------------------------------
# Load "WORD pronunciation" pairs. Iterating in *reverse* means that when a
# word appears more than once, the entry nearest the top of the file wins.
dictionary_path = r"/media/cookie/Samsung PM961/TwiBot/tacotron2/filelists/merged.dict_.txt"
print("Running, Please wait...")
thisdict = {}
with open(dictionary_path, "r") as dict_file:
    for line in reversed(dict_file.read().splitlines()):
        word, pronunciation = line.split(" ", 1)
        thisdict[word] = pronunciation.strip()
print("Dictionary Ready.")


# --- Functions ---------------------------------------------------------------
def concat_text(filenames, outpath):
    """Concatenate *filenames* into *outpath*, inserting a newline between files."""
    with open(outpath, 'w') as outfile:
        first = True
        for fname in filenames:
            if not first:
                outfile.write("\n")  # add newlines (\n) between each file
            first = False
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)


def arpabet(input_path, output_path, encoding="utf-8"):
    """Convert the text column of a "path|text|speaker" filelist to ARPAbet.

    Words found in `thisdict` are replaced by "{PHONEMES}"; trailing
    punctuation is peeled off before lookup and re-attached after the
    braces. Marker symbols found anywhere in a line are collected and
    re-emitted at the front of the text column.
    """
    # NOTE(review): this symbol set — and the lone "?" separators below —
    # were mangled to "?" by the pastebin retrieval; originally these were
    # special marker characters ("?" = new line). Kept byte-for-byte; TODO
    # restore the real characters from an intact copy.
    sym = list("??????•???????¤????¶§?????????")
    out_lines = []
    with open(input_path, "r") as infile:
        lines = infile.read().splitlines()
    for line in lines:
        # Pull marker symbols out of the line; they are prepended again below.
        phoneme_embed = ""
        for marker in sym:
            if marker in line:
                phoneme_embed += marker
                line = line.replace(marker, "")
        columns = line.split("|")
        out = ''
        for raw_word in columns[1].split(" "):
            word = raw_word
            end_chars = ''
            # Peel trailing punctuation so the dictionary lookup can match;
            # it is re-attached after the ARPAbet substitution.
            while any(ch in word for ch in r"!?,.;") and len(word) > 1:
                if word[-1] in ['!', '?', ',', '.', ';', ';', ':', "'", "?", "-", "_"]:
                    end_chars = word[-1] + end_chars
                    word = word[:-1]
                else:
                    break
            word_arpa = thisdict.get(word.upper(), '')
            if word_arpa:
                word = "{" + str(word_arpa) + "}"
            out = (out + " " + word + end_chars).strip()
        out_lines.append(columns[0] + "|" + phoneme_embed + out + "?|" + columns[2])
    # Join once (the original concatenated quadratically); then collapse
    # doubled separator characters.
    output_string = "".join(l + "\n" for l in out_lines).replace("??", "?")
    with open(output_path, "w", encoding=encoding) as text_file:
        text_file.write(output_string)


metadata = {}
# metadata["celestia"] = [{"file_path": "", "timestamp": "00_00_05",
#                          "emotions": ["neutral"], "noise_level": "",
#                          "quote": "Once upon a time."}, ..., ...]


def build_metadata(ignore_dirs=None):
    """Recursively scan the global `directory` for .wav files and populate
    `metadata`, keyed by lower-cased voice name.

    For every .wav found, the accompanying .txt transcript is read; files
    whose names do not carry all seven "_"-separated fields, or whose
    transcript is missing, are reported and skipped.
    """
    if ignore_dirs is None:  # avoid the shared-mutable-default pitfall
        ignore_dirs = ["Songs", "Noise samples"]
    for dir_ in [x[0] for x in os.walk(directory)]:  # recursive directory search
        if len(os.listdir(dir_)) < 1:
            continue  # empty directory
        if any(directory_filter in dir_ for directory_filter in ignore_dirs):
            continue
        for filename in os.listdir(dir_):
            if not filename.endswith(".wav"):
                continue
            file_path = os.path.join(dir_, filename)
            splitted = filename.split("_")
            try:
                timestamp = "_".join(splitted[0:3])        # 00_00_05
                voice = splitted[3].lower()                # celestia
                emotions = splitted[4].lower().split(" ")  # neutral
                noise_level = splitted[5].lower()          # "" = clean, "noisy" = Noisy, "very noisy" = Very Noisy
                splitted[6]  # quote field must exist for the name to be valid
            except IndexError:
                print("'" + os.path.join(dir_, filename) + "' is not a valid file")
                # BUGFIX: the original fell through here and appended the clip
                # using voice/timestamp/... left over from the previous file.
                continue
            try:
                with open(os.path.join(dir_, filename.replace(".wav", ".txt")),
                          'r', encoding="latin-1") as file:
                    txt_quote = file.read().replace('\n', '')  # Once upon a time.
            except OSError:
                print("txt for '" + str(os.path.join(dir_, filename)) + "' is missing")
                continue
            metadata.setdefault(voice, []).append({
                "file_path": file_path,
                "timestamp": timestamp,
                "emotions": emotions,
                "noise_level": noise_level,
                "quote": txt_quote,
            })


def write_datasets(speaker_id=0, permitted_noise_levels=None, minimum_clips=3):
    """Split `metadata` into shuffled train/validation filelists and write them.

    Voices with fewer than *minimum_clips* clips are skipped (and consume no
    speaker id); each remaining voice gets the next sequential id starting at
    *speaker_id*. Uses the globals `directory` and `percentage_training_data`.
    """
    if permitted_noise_levels is None:  # avoid the shared-mutable-default pitfall
        permitted_noise_levels = [""]
    multi_speaker_lines = []
    for voice in metadata:
        meta = metadata[voice]
        if len(meta) < minimum_clips:
            continue  # ignore voices with less than `minimum_clips` clips of audio
        for clip in meta:
            if clip["noise_level"] in permitted_noise_levels:
                # NOTE(review): the lone "?" separator was garbled in retrieval.
                multi_speaker_lines.append(
                    clip["file_path"] + "|" + clip["quote"] + "?|" + str(speaker_id))
        speaker_id += 1  # next speaker_id for next voice

    # Shuffle a *copy* so the speaker-ordered listing below still sees the
    # original list untouched (the original code aliased and shuffled in place).
    shuffled_lines = list(multi_speaker_lines)
    shuffle(shuffled_lines)
    train_end = int(len(shuffled_lines) * percentage_training_data)
    train_arr = shuffled_lines[:train_end]
    validation_arr = shuffled_lines[train_end:]

    # Unshuffled listing: regroup lines by speaker id, in id order. After the
    # loop above, `speaker_id` is one past the last id used, so range(speaker_id)
    # covers every id even when the caller started from a nonzero base.
    unshuffled_multi_speaker_lines = []
    for i in range(speaker_id):
        for line in multi_speaker_lines:
            if line.split("|")[2] == str(i):
                unshuffled_multi_speaker_lines.append(line)
    write_files(unshuffled_multi_speaker_lines, train_arr, validation_arr,
                output_directory_=directory)


def write_files(multi_speaker_lines, train_arr, val_arr, output_directory_):
    """Write all filelist variants into `<output_directory_>/filelists`.

    Produces unshuffled/train/validation text lists, their ARPAbet versions,
    text+ARPAbet merged versions, and ".npy"-pathed "mel_" copies of each.
    """
    output_directory = os.path.join(output_directory_, "filelists")
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    def _path(name):
        # All outputs live in the filelists directory.
        return os.path.join(output_directory, name)

    def _write(name, lines, mel=False):
        # One filelist per call; mel=True rewrites audio paths to mel .npy paths.
        text = "\n".join(lines)
        if mel:
            text = text.replace(".wav|", ".npy|")
        with open(_path(name), "w", encoding="utf-8") as text_file:
            text_file.write(text)

    # generate text dataset metadata
    _write("unshuffled_taca2.txt", multi_speaker_lines)
    _write("train_taca2.txt", train_arr)
    # generate arpabet dataset metadata
    arpabet(_path("train_taca2.txt"), _path("train_taca2_arpa.txt"))
    _write("validation_taca2.txt", val_arr)
    arpabet(_path("validation_taca2.txt"), _path("validation_taca2_arpa.txt"))
    # generate merged dataset metadata
    concat_text([_path("train_taca2.txt"), _path("train_taca2_arpa.txt")],
                _path("train_taca2_merged.txt"))
    concat_text([_path("validation_taca2.txt"), _path("validation_taca2_arpa.txt")],
                _path("validation_taca2_merged.txt"))
    # generate mel text dataset metadata
    _write("mel_train_taca2.txt", train_arr, mel=True)
    arpabet(_path("mel_train_taca2.txt"), _path("mel_train_taca2_arpa.txt"))
    # generate mel arpabet dataset metadata
    _write("mel_validation_taca2.txt", val_arr, mel=True)
    arpabet(_path("mel_validation_taca2.txt"), _path("mel_validation_taca2_arpa.txt"))
    # generate mel merged dataset metadata
    concat_text([_path("mel_train_taca2.txt"), _path("mel_train_taca2_arpa.txt")],
                _path("mel_train_taca2_merged.txt"))
    concat_text([_path("mel_validation_taca2.txt"), _path("mel_validation_taca2_arpa.txt")],
                _path("mel_validation_taca2_merged.txt"))


def convert_dir_to_wav(directory, SAMPLE_RATE=48000, ignore_dirs=None):
    """Convert every .flac under *directory* to a 16-bit .wav at SAMPLE_RATE."""
    if ignore_dirs is None:  # avoid the shared-mutable-default pitfall
        ignore_dirs = ["Songs", "Noise samples"]
    for dir_ in tqdm([x[0] for x in os.walk(directory)]):  # recursive directory search
        if any(directory_filter in dir_ for directory_filter in ignore_dirs):
            continue
        for filename in os.listdir(dir_):
            if filename.endswith(".flac"):
                file_path = os.path.join(dir_, filename)
                wav_path = file_path.replace(".flac", ".wav")
                tqdm.write(file_path + " --> " + wav_path)
                # sample_width is bit_depth in bytes eg: 2 = 16 bit audio.
                converter.flac2wav(file_path, wav_path, "flac",
                                   frame_rate=SAMPLE_RATE, sample_width=2)


def set_wavs_to_mono(directory):
    """Down-mix every .wav under *directory* to a single channel, in place."""
    for dir_ in tqdm([x[0] for x in os.walk(directory)]):  # recursive directory search
        for filename in os.listdir(dir_):
            if filename.endswith(".wav"):
                file_path = os.path.join(dir_, filename)
                tqdm.write(file_path)
                sound = AudioSegment.from_wav(file_path)
                sound = sound.set_channels(1)
                sound.export(file_path, format="wav")


# --- Main block --------------------------------------------------------------
# These globals are read by build_metadata() and write_datasets().
directory = r"/media/cookie/StableHDD/ClipperDatasetV2/SlicedDialogueTrimmed"
percentage_training_data = 0.95

if __name__ == "__main__":
    # NOTE(review): datasets are built *before* the flac->wav conversion and
    # mono down-mix; kept in the original order — confirm this is intended.
    build_metadata(ignore_dirs=["Songs", "Noise samples"])
    write_datasets(permitted_noise_levels=[""], minimum_clips=3)
    convert_dir_to_wav(directory)
    set_wavs_to_mono(directory)