import outetts
# Model configuration: the model weights and the tokenizer are both loaded
# from the same "OuteAI/OuteTTS-0.3-1B" identifier.
model_config = outetts.HFModelConfig_v2(
    model_path="OuteAI/OuteTTS-0.3-1B",
    tokenizer_path="OuteAI/OuteTTS-0.3-1B",
)

# Build the Hugging Face interface for model version 0.3.
interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)

# You can create a speaker profile for voice cloning, which is compatible
# across all backends:
# speaker = interface.create_speaker(audio_path="path/to/audio/file.wav")
# interface.save_speaker(speaker, "speaker.json")
# speaker = interface.load_speaker("speaker.json")

# Print available default speakers.
interface.print_default_speakers()

# Use one of the bundled default speakers.
speaker = interface.load_default_speaker(name="en_male_1")

# Generation settings: input text, sampling parameters, and the speaker.
gen_cfg = outetts.GenerationConfig(
    text="Speech synthesis is the artificial production of human speech.",
    temperature=0.1,
    repetition_penalty=1.1,
    max_length=4096,
    speaker=speaker,
    # voice_characteristics="upbeat enthusiasm, friendliness, clarity, professionalism, and trustworthiness"
)

# Run synthesis and write the result to "output.wav".
output = interface.generate(config=gen_cfg)
output.save("output.wav")
noteの情報からcubeの右・左のモーターの速度および移動時間を計算します。このときにcubeが移動できる範囲(シミュレーション上)が決まっているので、落ちないように調節をしました。
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch
# Build a text-to-speech pipeline around the "microsoft/speecht5_tts" checkpoint.
synthesiser = pipeline(task="text-to-speech", model="microsoft/speecht5_tts")

# Take the x-vector stored at row 7306 of the validation split and add a
# leading dimension of size 1 to it.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embedding = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    0,
)
# You can replace this embedding with your own as well.

# Synthesize the sentence, passing the speaker embedding via forward_params.
speech = synthesiser(
    "Hello, my dog is cooler than you!",
    forward_params={"speaker_embeddings": speaker_embedding},
)

# Write the audio using the sampling rate reported by the pipeline output.
sf.write(
    "speech_pipeline.wav",
    speech["audio"],
    samplerate=speech["sampling_rate"],
)
transformers modelling codeを使う方法
以下のコードで実行できます
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
from datasets import load_dataset
# Load the text processor, the text-to-speech model, and the vocoder.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Convert the input sentence into PyTorch tensors.
inputs = processor(
    text="Hello, my dog is cute.",
    return_tensors="pt",
)

# Load the x-vector containing the speaker's voice characteristics from a
# dataset (row 7306 of the validation split) and add a leading dimension.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    0,
)

# Generate speech from the token ids and speaker embedding, using the vocoder.
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings,
    vocoder=vocoder,
)

# Save the result as a WAV file with a sample rate of 16000.
sf.write(
    "speech_modelling.wav",
    speech.numpy(),
    samplerate=16000,
)
まずは学習させるためのyoutubeからVTuberと非VTuberのチャンネルの情報を取得します。
(youtubeAPIの制限により多くは取得できないです)
root@a895d360c83b:/work/espnet/egs2/ljspeech/tts1# ls -l exp/imdanboy/jets/decode_train.loss.ave/dev/wav/
total 69404
-rw-r--r-- 1 root root 235052 Feb 17 15:21 LJ049-0008.wav
-rw-r--r-- 1 root root 421420 Feb 17 15:22 LJ049-0009.wav
-rw-r--r-- 1 root root 138284 Feb 17 15:22 LJ049-0010.wav
-rw-r--r-- 1 root root 394284 Feb 17 15:22 LJ049-0011.wav
以下は推論実行時のログです
root@a895d360c83b:/work/espnet/egs2/ljspeech/tts1# ./run.sh --skip_data_prep false --skip_train true --download_model imdanboy/jets
2025-02-17T15:11:01 (tts.sh:211:main) ./tts.sh --lang en --feats_type raw --fs 22050 --n_fft 1024 --n_shift 256 --token_type phn --cleaner tacotron --g2p g2p_en_no_space --train_config conf/train.yaml --inference_config conf/decode.yaml --train_set tr_no_dev --valid_set dev --test_sets dev eval1 --srctexts data/tr_no_dev/text --audio_format wav --skip_data_prep false --skip_train true --download_model imdanboy/jets
2025-02-17T15:11:02 (tts.sh:307:main) Stage 1: Data preparation for data/tr_no_dev, data/dev, etc.
2025-02-17T15:11:02 (data.sh:16:main) local/data.sh
2025-02-17T15:11:02 (data.sh:39:main) stage -1: Data Download
already exists. skipped.
2025-02-17T15:11:03 (data.sh:44:main) stage 0: Data Preparation
utils/validate_data_dir.sh: WARNING: you have only one speaker. This probably a bad idea.
Search for the word 'bold'in http://kaldi-asr.org/doc/data_prep.html
for more information.
utils/validate_data_dir.sh: Successfully validated data-directory data/train
2025-02-17T15:20:07 (data.sh:77:main) stage 2: utils/subset_data_dir.sg
utils/subset_data_dir.sh: reducing #utt from 13100 to 500
utils/subset_data_dir.sh: reducing #utt from 500 to 250
utils/subset_data_dir.sh: reducing #utt from 500 to 250
utils/subset_data_dir.sh: reducing #utt from 13100 to 12600
2025-02-17T15:20:10 (data.sh:86:main) Successfully finished. [elapsed=548s]
2025-02-17T15:20:10 (tts.sh:323:main) Stage 2: Format wav.scp: data/ -> dump/raw/
utils/copy_data_dir.sh: copied data from data/tr_no_dev to dump/raw/org/tr_no_dev
utils/validate_data_dir.sh: WARNING: you have only one speaker. This probably a bad idea.
Search for the word 'bold'in http://kaldi-asr.org/doc/data_prep.html
for more information.
utils/validate_data_dir.sh: Successfully validated data-directory dump/raw/org/tr_no_dev
2025-02-17T15:20:12 (format_wav_scp.sh:42:main) scripts/audio/format_wav_scp.sh --nj 8 --cmd run.pl --audio-format wav --fs 22050 data/tr_no_dev/wav.scp dump/raw/org/tr_no_dev
2025-02-17T15:20:13 (format_wav_scp.sh:110:main)[info]: without segments
2025-02-17T15:20:34 (format_wav_scp.sh:142:main) Successfully finished. [elapsed=22s]
utils/copy_data_dir.sh: copied data from data/dev to dump/raw/org/dev
utils/validate_data_dir.sh: WARNING: you have only one speaker. This probably a bad idea.
Search for the word 'bold'in http://kaldi-asr.org/doc/data_prep.html
for more information.
utils/validate_data_dir.sh: Successfully validated data-directory dump/raw/org/dev
2025-02-17T15:20:35 (format_wav_scp.sh:42:main) scripts/audio/format_wav_scp.sh --nj 8 --cmd run.pl --audio-format wav --fs 22050 data/dev/wav.scp dump/raw/org/dev
2025-02-17T15:20:36 (format_wav_scp.sh:110:main)[info]: without segments
2025-02-17T15:20:39 (format_wav_scp.sh:142:main) Successfully finished. [elapsed=4s]
utils/copy_data_dir.sh: copied data from data/dev to dump/raw/org/dev
utils/validate_data_dir.sh: WARNING: you have only one speaker. This probably a bad idea.
Search for the word 'bold'in http://kaldi-asr.org/doc/data_prep.html
for more information.
utils/validate_data_dir.sh: Successfully validated data-directory dump/raw/org/dev
2025-02-17T15:20:40 (format_wav_scp.sh:42:main) scripts/audio/format_wav_scp.sh --nj 8 --cmd run.pl --audio-format wav --fs 22050 data/dev/wav.scp dump/raw/org/dev
2025-02-17T15:20:41 (format_wav_scp.sh:110:main)[info]: without segments
2025-02-17T15:20:44 (format_wav_scp.sh:142:main) Successfully finished. [elapsed=4s]
utils/copy_data_dir.sh: copied data from data/eval1 to dump/raw/eval1
utils/validate_data_dir.sh: WARNING: you have only one speaker. This probably a bad idea.
Search for the word 'bold'in http://kaldi-asr.org/doc/data_prep.html
for more information.
utils/validate_data_dir.sh: Successfully validated data-directory dump/raw/eval1
2025-02-17T15:20:45 (format_wav_scp.sh:42:main) scripts/audio/format_wav_scp.sh --nj 8 --cmd run.pl --audio-format wav --fs 22050 data/eval1/wav.scp dump/raw/eval1
2025-02-17T15:20:45 (format_wav_scp.sh:110:main)[info]: without segments
2025-02-17T15:20:49 (format_wav_scp.sh:142:main) Successfully finished. [elapsed=4s]
2025-02-17T15:20:49 (tts.sh:468:main) Stage 3: Remove long/short data: dump/raw/org -> dump/raw
utils/copy_data_dir.sh: copied data from dump/raw/org/tr_no_dev to dump/raw/tr_no_dev
utils/validate_data_dir.sh: WARNING: you have only one speaker. This probably a bad idea.
Search for the word 'bold'in http://kaldi-asr.org/doc/data_prep.html
for more information.
utils/validate_data_dir.sh: Successfully validated data-directory dump/raw/tr_no_dev
fix_data_dir.sh: kept all 12600 utterances.
fix_data_dir.sh: old files are kept in dump/raw/tr_no_dev/.backup
utils/copy_data_dir.sh: copied data from dump/raw/org/dev to dump/raw/dev
utils/validate_data_dir.sh: WARNING: you have only one speaker. This probably a bad idea.
Search for the word 'bold'in http://kaldi-asr.org/doc/data_prep.html
for more information.
utils/validate_data_dir.sh: Successfully validated data-directory dump/raw/dev
fix_data_dir.sh: kept all 250 utterances.
fix_data_dir.sh: old files are kept in dump/raw/dev/.backup
2025-02-17T15:20:55 (tts.sh:523:main) Stage 4: Generate token_list from data/tr_no_dev/text
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data] Unzipping corpora/cmudict.zip.
/usr/bin/python3 /work/espnet/espnet2/bin/tokenize_text.py --token_type phn -f 2- --input dump/raw/srctexts --output dump/token_list/phn_tacotron_g2p_en_no_space/tokens.txt --non_linguistic_symbols none --cleaner tacotron --g2p g2p_en_no_space --write_vocabulary true --add_symbol '<blank>:0' --add_symbol '<unk>:1' --add_symbol '<sos/eos>:-1'
2025-02-17 15:21:10,513(tokenize_text:174) INFO: OOV rate =0.0 %
2025-02-17T15:21:10 (tts.sh:907:main) Skip training stages
2025-02-17T15:21:10 (tts.sh:912:main) Use imdanboy/jets for decoding and evaluation
(â¦)2p_en_no_space%2Ftrain%2Fpitch_stats.npz: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|770/770[00:00<00:00, 266kB/s](â¦)e%2Fimages%2Fdiscriminator_fake_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|75.2k/75.2k [00:00<00:00, 4.79MB/s](â¦)p_en_no_space%2Ftrain%2Fenergy_stats.npz: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|770/770[00:00<00:00, 238kB/s](â¦)images%2Fdiscriminator_backward_time.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|73.7k/73.7k [00:00<00:00, 5.66MB/s]
README.md: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|11.6k/11.6k [00:00<00:00, 4.72MB/s](â¦)n_tacotron_g2p_en_no_space%2Fconfig.yaml: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|9.54k/9.54k [00:00<00:00, 3.15MB/s](â¦)2p_en_no_space%2Ftrain%2Ffeats_stats.npz: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|1.40k/1.40k [00:00<00:00, 481kB/s]
.gitattributes: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|1.17k/1.17k [00:00<00:00, 426kB/s](â¦)ages%2Fdiscriminator_optim_step_time.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|36.0k/36.0k [00:00<00:00, 11.1MB/s](â¦)_space%2Fimages%2Fdiscriminator_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|75.1k/75.1k [00:00<00:00, 16.8MB/s](â¦)%2Fimages%2Fdiscriminator_train_time.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|74.1k/74.1k [00:00<00:00, 19.7MB/s](â¦)es%2Fgenerator_align_forwardsum_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|32.7k/32.7k [00:00<00:00, 26.1MB/s](â¦)%2Fimages%2Fgenerator_align_bin_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|32.8k/32.8k [00:00<00:00, 23.1MB/s](â¦)pace%2Fimages%2Fgenerator_align_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|31.7k/31.7k [00:00<00:00, 23.5MB/s](â¦)e%2Fimages%2Fgenerator_backward_time.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|76.1k/76.1k [00:00<00:00, 29.8MB/s](â¦)ce%2Fimages%2Fgenerator_forward_time.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|27.8k/27.8k [00:00<00:00, 25.9MB/s](â¦)Fimages%2Fdiscriminator_forward_time.png: 
100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|76.3k/76.3k [00:00<00:00, 533kB/s](â¦)e%2Fimages%2Fdiscriminator_real_loss.png: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|75.7k/75.7k [00:00<00:00, 505kB/s](â¦)pace%2Fimages%2Fgenerator_g_adv_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|65.5k/65.5k [00:00<00:00, 45.3MB/s](â¦)no_space%2Fimages%2Fgenerator_g_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|36.8k/36.8k [00:00<00:00, 40.3MB/s](â¦)pace%2Fimages%2Fgenerator_g_mel_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|33.9k/33.9k [00:00<00:00, 28.1MB/s](â¦)images%2Fgenerator_g_feat_match_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|44.5k/44.5k [00:00<00:00, 21.5MB/s](â¦)n_no_space%2Fimages%2Fgenerator_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|33.2k/33.2k [00:00<00:00, 28.5MB/s](â¦)2Fimages%2Fgenerator_optim_step_time.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|39.0k/39.0k [00:00<00:00, 35.3MB/s](â¦)2Fimages%2Fgenerator_var_energy_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|31.1k/31.1k [00:00<00:00, 27.1MB/s](â¦)ce%2Fimages%2Fgenerator_var_dur_loss.png: 
100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|39.5k/39.5k [00:00<00:00, 30.8MB/s](â¦)ace%2Fimages%2Fgpu_max_cached_mem_GB.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|32.8k/32.8k [00:00<00:00, 22.4MB/s](â¦)_space%2Fimages%2Fgenerator_var_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|30.9k/30.9k [00:00<00:00, 10.4MB/s](â¦)%2Fimages%2Fgenerator_var_pitch_loss.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|35.4k/35.4k [00:00<00:00, 14.0MB/s](â¦)pace%2Fimages%2Fgenerator_train_time.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|30.3k/30.3k [00:00<00:00, 33.5MB/s](â¦)2p_en_no_space%2Fimages%2Foptim0_lr0.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|25.2k/25.2k [00:00<00:00, 22.6MB/s](â¦)2p_en_no_space%2Fimages%2Foptim1_lr0.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|25.0k/25.0k [00:00<00:00, 19.9MB/s](â¦)2p_en_no_space%2Fimages%2Ftrain_time.png: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|36.2k/36.2k [00:00<00:00, 27.2MB/s](â¦)g2p_en_no_space%2Fimages%2Fiter_time.png: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|85.4k/85.4k [00:00<00:00, 592kB/s]
meta.yaml: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|326/326[00:00<00:00, 319kB/s]
train.total_count.ave_5best.pth: 100%|âââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ| 334M/334M [00:09<00:00, 33.6MB/s]
Fetching 36 files: 100%|ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ|36/36[00:12<00:00, 2.90it/s]
2025-02-17T15:21:24 (tts.sh:933:main) Stage 7: Decoding: training_dir=exp/imdanboy/jets
2025-02-17T15:21:24 (tts.sh:956:main) Generate 'exp/imdanboy/jets/decode_train.loss.ave/run.sh'. You can resume the process from stage 7 using this script
2025-02-17T15:21:25 (tts.sh:1010:main) Decoding started... log: 'exp/imdanboy/jets/decode_train.loss.ave/dev/log/tts_inference.*.log'
2025-02-17T15:30:33 (tts.sh:1010:main) Decoding started... log: 'exp/imdanboy/jets/decode_train.loss.ave/eval1/log/tts_inference.*.log'
2025-02-17T15:40:02 (tts.sh:1180:main) Skip the uploading stage
2025-02-17T15:40:02 (tts.sh:1232:main) Skip the uploading to HuggingFace stage
2025-02-17T15:40:02 (tts.sh:1235:main) Successfully finished. [elapsed=1741s]