Tacotron2+HifiGAN Paimon 600 Voice Synthesis Model Download

The model was trained on Google Colab. I have no money for Colab Pro, so it took a long cycle of reconnecting, training, reconnecting, training;
The training target was set at 600, and training is now fully complete.
Model size: 322 MB (338,426,303 bytes)
To synthesize audio, the input is pinyin plus tone numbers (see the short pypinyin sketch below).
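The inference code further down does this conversion automatically with pypinyin (Style.TONE3 appends the tone number to each syllable). A minimal sketch of that conversion, assuming pypinyin is installed; the example string is only an illustration:

from pypinyin import lazy_pinyin, Style

text = "派蒙"  # any Chinese string
print(" ".join(lazy_pinyin(text, style=Style.TONE3)))  # prints something like: pai4 meng2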
Test audio: https://wwb.lanzoul.com/ia7gs0bcr6da
Because the training data is uneven, results vary from sentence to sentence, but to my ear it is already quite close, even if not as good as VITS;


Because the model is larger than 100 MB, it cannot be uploaded to Lanzou for sharing;
Google Drive share link: https://drive.google.com/file/d/1I9kj7187xFyv9xapvmR-oBeILKX0gx9u/view?usp=sharing
The file has also been uploaded to the group (Group 1); if you cannot access Google Drive, you can join the group and download it there.
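If you run this on Colab, one possible way to pull the checkpoint straight into the runtime (instead of copying it into your own Drive) is gdown with the file ID from the share link above; the output path here is only an example, and Tacotron2_Model in the code below has to point at wherever you save it:

!pip install -q gdown
# File ID taken from the Google Drive share link above; /content/Paimon_test is an example path.
!gdown --id 1I9kj7187xFyv9xapvmR-oBeILKX0gx9u -O /content/Paimon_test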
Model inference code:
#@markdown Config:
#@markdown Restart the code to apply any changes.
#Add new characters here.
#Universal HiFi-GAN (has some robotic noise): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
Tacotron2_Model = '/content/drive/MyDrive/colab/outdir/Paimon_test'#@param {type:"string"}
TACOTRON2_ID = Tacotron2_Model
HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"
!pip install -q pypinyin  # make sure pypinyin is available
from pypinyin import lazy_pinyin, Style
# Check if already initialized
try:
    initilized
except NameError:
    print("Setting up, please wait.\n")
    !pip install tqdm -q
    from tqdm.notebook import tqdm
    with tqdm(total=5, leave=False) as pbar:
        %tensorflow_version 2.x
        import os
        from os.path import exists, join, basename, splitext
        !pip install gdown
        git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
        project_name = splitext(basename(git_repo_url))[0]
        if not exists(project_name):
            # clone and install
            !git clone -q --recursive {git_repo_url}
            !git clone -q --recursive https://github.com/SortAnon/hifi-gan
            !pip install -q librosa unidecode
        pbar.update(1) # downloaded TT2 and HiFi-GAN
        import sys
        sys.path.append('hifi-gan')
        sys.path.append(project_name)
        import time
        import matplotlib
        import matplotlib.pylab as plt
        import gdown
        d = 'https://drive.google.com/uc?id='  # Google Drive direct-download URL prefix for gdown
        %matplotlib inline
        import IPython.display as ipd
        import numpy as np
        import torch
        import json
        from hparams import create_hparams
        from model import Tacotron2
        from layers import TacotronSTFT
        from audio_processing import griffin_lim
        from text import text_to_sequence
        from env import AttrDict
        from meldataset import MAX_WAV_VALUE
        from models import Generator
        pbar.update(1) # initialized dependencies
        graph_width = 900
        graph_height = 360
        def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
            %matplotlib inline
            fig, axes = plt.subplots(1, len(data), figsize=figsize)
            for i in range(len(data)):
                axes[i].imshow(data[i], aspect='auto', origin='lower',
                               interpolation='none', cmap='inferno')
            fig.canvas.draw()
            plt.show()
        # Set up pronunciation dictionary
        !gdown --id '1E12g_sREdcH5vuZb44EZYX8JjGWQ9rRp'
        thisdict = {}
        for line in reversed((open('merged.dict.txt', "r").read()).splitlines()):
            thisdict[(line.split(" ", 1))[0]] = (line.split(" ", 1))[1].strip()
        pbar.update(1) # downloaded and set up pronunciation dictionary
        def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):
            out = ''
            for word_ in text.split(" "):
                word = word_; end_chars = ''
                while any(elem in word for elem in punctuation) and len(word) > 1:
                    if word[-1] in punctuation: end_chars = word[-1] + end_chars; word = word[:-1]
                    else: break
                try:
                    word_arpa = thisdict[word.upper()]
                    word = "{" + str(word_arpa) + "}"
                except KeyError: pass
                out = (out + " " + word + end_chars).strip()
            if EOS_Token and out[-1] != ";": out += ";"
            return out
        def get_hifigan(MODEL_ID):
            # Download HiFi-GAN
            hifigan_pretrained_model = 'hifimodel'
            gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)
            if not exists(hifigan_pretrained_model):
                raise Exception("HiFi-GAN model failed to download!")
            # Load HiFi-GAN
            conf = os.path.join("hifi-gan", "config_v1.json")
            with open(conf) as f:
                json_config = json.loads(f.read())
            h = AttrDict(json_config)
            torch.manual_seed(h.seed)
            hifigan = Generator(h).to(torch.device("cuda"))
            state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cuda"))
            hifigan.load_state_dict(state_dict_g["generator"])
            hifigan.eval()
            hifigan.remove_weight_norm()
            return hifigan, h
        hifigan, h = get_hifigan(HIFIGAN_ID)
        pbar.update(1) # downloaded and set up HiFi-GAN
        def has_MMI(STATE_DICT):
            return any(True for x in STATE_DICT.keys() if "mi." in x)
        def get_Tactron2(MODEL_ID):
            # Load the Tacotron2 checkpoint from the path configured above
            tacotron2_pretrained_model = TACOTRON2_ID
            if not exists(tacotron2_pretrained_model):
                raise Exception("Tacotron2 model failed to download!")
            # Load Tacotron2 and config
            hparams = create_hparams()
            hparams.sampling_rate = 22050
            hparams.max_decoder_steps = 3000 # Max duration
            hparams.gate_threshold = 0.25 # Model must be 25% sure the clip is over before ending generation
            model = Tacotron2(hparams)
            state_dict = torch.load(tacotron2_pretrained_model)['state_dict']
            if has_MMI(state_dict):
                raise Exception("ERROR: This notebook does not currently support MMI models.")
            model.load_state_dict(state_dict)
            _ = model.cuda().eval().half()
            return model, hparams
        model, hparams = get_Tactron2(TACOTRON2_ID)
        previous_tt2_id = TACOTRON2_ID
        pbar.update(1) # downloaded and set up Tacotron2
        # Extra Info
        def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
            for i in [x for x in text.split("\n") if len(x)]:
                if not pronounciation_dictionary:
                    if i[-1] != ";": i = i + ";"
                else: i = ARPA(i)
                with torch.no_grad(): # save VRAM by not including gradients
                    sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]
                    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
                    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
                    if show_graphs:
                        plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                                   alignments.float().data.cpu().numpy()[0].T))
                    y_g_hat = hifigan(mel_outputs_postnet.float())
                    audio = y_g_hat.squeeze()
                    audio = audio * MAX_WAV_VALUE
                    print("")
                    ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate))
    from IPython.display import clear_output
    clear_output()
    initilized = "Ready"
if previous_tt2_id != TACOTRON2_ID:
    print("Updating Models")
    model, hparams = get_Tactron2(TACOTRON2_ID)
    hifigan, h = get_hifigan(HIFIGAN_ID)
    previous_tt2_id = TACOTRON2_ID
pronounciation_dictionary = False #@param {type:"boolean"}
# disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronunciations or just for testing
show_graphs = True #@param {type:"boolean"}
max_duration = 25 #this does nothing
model.decoder.max_decoder_steps = 1000 #@param {type:"integer"}
stop_threshold = 0.3 #@param {type:"number"}
model.decoder.gate_threshold = stop_threshold
#@markdown ---
print(f"Current Config:\npronounciation_dictionary: {pronounciation_dictionary}\nshow_graphs: {show_graphs}\nmax_duration (in seconds): {max_duration}\nstop_threshold: {stop_threshold}\n\n")
time.sleep(1)
print("Enter/Paste?your?text.輸入拼音+數(shù)字表示聲調(diào),支持直接中文輸入")
contents?=?[]
while?True:
????try:
????????print("-"*50)
????????line?=?input()
????????if?line?!=?"":
??????????line?=?"?".join(lazy_pinyin(line,?style=Style.TONE3))
????????print(line)
????????end_to_end_infer(line,?pronounciation_dictionary,?show_graphs)
????except?EOFError:
????????break
????except?KeyboardInterrupt:
????????print("Stopping...")
????????break