随着AI技术的发展,语音克隆技术也受到越来越多的关注。目前,AI语音克隆技术已经可以实现模拟出一个人的声音,包括音色、语言习惯和情绪等。
AI合成郭老师英语接受专访
1
MockingBird
-
项目地址: https://github.com/babysor/MockingBird
-
论文地址: https://arxiv.org/pdf/1806.04558.pdf
项目介绍
环境搭建
Ubuntu-22.04
python 3.7.12
pytorch 1.8.0
安装ffmpeg
pip install -r requirements.txt
使用
import os
import sys
import typer
cli = typer.Typer()
@cli.command()
def launch(port: int = typer.Option(8080, "--port", "-p")) -> None:
    """Start a graphical UI server for the opyrator.
    The UI is auto-generated from the input- and output-schema of the given
    function.
    """
    # The opyrator module is resolved relative to where the CLI was invoked,
    # so the current working directory must be importable first.
    cwd = os.getcwd()
    sys.path.append(cwd)

    # Imported lazily: the streamlit UI stack is only needed for this command.
    from control.mkgui.base.ui.streamlit_ui import launch_ui

    launch_ui(port)
if __name__ == "__main__":
cli()
# Python source for a one-shot Streamlit entry script: launch_ui() writes
# this snippet to a temporary .py file and executes it via `streamlit run`.
# NOTE: the text below is runtime content, not comments of this module.
STREAMLIT_RUNNER_SNIPPET = """
from control.mkgui.base.ui import render_streamlit_ui
import streamlit as st
# TODO: Make it configurable
# Page config can only be setup once
st.set_page_config(
page_title="MockingBird",
page_icon="🧊",
layout="wide")
render_streamlit_ui()
"""
def launch_ui(port: int = 8501) -> None:
    """Run the Streamlit UI server on ``port`` (blocks until it exits).

    Writes ``STREAMLIT_RUNNER_SNIPPET`` to a temporary .py file, launches it
    with ``streamlit run`` and removes the file once the server stops.

    Args:
        port: TCP port for the Streamlit server (default 8501).
    """
    import subprocess

    with NamedTemporaryFile(
        suffix=".py", mode="w", encoding="utf-8", delete=False
    ) as f:
        f.write(STREAMLIT_RUNNER_SNIPPET)
        script_path = f.name

    try:
        # BUG FIX: the original ran `set STREAMLIT_...=false` in a *separate*
        # subprocess.run call — that shell exits immediately and the variable
        # never reaches the streamlit process. Pass the environment to the
        # child directly instead, and extend PYTHONPATH the portable way
        # (os.pathsep) rather than via platform-specific shell syntax.
        env = dict(os.environ)
        env["PYTHONPATH"] = os.pathsep.join(
            p for p in (env.get("PYTHONPATH"), getcwd()) if p
        )
        env["STREAMLIT_GLOBAL_SHOW_WARNING_ON_DIRECT_EXECUTION"] = "false"

        # List form avoids shell quoting issues entirely (the original shell
        # string also contained literal newlines that corrupted the flags).
        subprocess.run(
            [
                sys.executable,
                "-m", "streamlit", "run",
                f"--server.port={port}",
                "--server.headless=True",
                "--runner.magicEnabled=False",
                "--server.maxUploadSize=50",
                "--browser.gatherUsageStats=False",
                script_path,
            ],
            env=env,
        )
    finally:
        # Always clean up the temp script, even if streamlit crashes.
        unlink(script_path)
微调流程
# Audio preprocessing
sample_rate = 16000  # target sampling rate (Hz) for training audio
max_mel_frames = 900  # skip utterances whose mel spectrogram exceeds this length
rescaling_max = 0.9  # peak-normalize waveforms to this maximum amplitude
synthesis_batch_size = 16  # batch size used during synthesis/preprocessing
fmax = 7600  # upper mel-filterbank frequency bound (Hz)
allow_clipping_in_normalization = True  # tolerate clipping when rescaling
# Training
# NOTE(review): each tuple appears to be (reduction factor, learning rate,
# step threshold, batch size) per MockingBird's hparams — confirm upstream.
tts_schedule = [(2, 1e-3, 10_000, 12),
(2, 5e-4, 15_000, 12),
(2, 2e-4, 20_000, 12),
(2, 1e-4, 30_000, 12),
(2, 5e-5, 40_000, 12),
(2, 1e-5, 60_000, 12),
(2, 5e-6, 160_000, 12),
(2, 3e-6, 320_000, 12),
(2, 1e-6, 640_000, 12)]
问题记录
2
Clone-voice
项目地址:
项目介绍
环境
python 3.9.18
torch 2.1.1
torchaudio 2.1.1
TTS 0.20.6
gradio 4.5.0
使用
3
Coqui XTTS
-
项目地址: https://github.com/coqui-ai/TTS
-
文档: https://docs.coqui.ai/en/dev/models/xtts.html
-
huggingface: https://huggingface.co/coqui/XTTS-v2
-
paper: https://arxiv.org/abs/2112.02418
项目介绍
使用
import torch
from TTS.api import TTS

# Get device: prefer GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# List available 🐸TTS models
print(TTS().list_models())

# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Run TTS
# ❗ Since this model is multi-lingual voice cloning model, we must set the
# target speaker_wav and language.
# (BUG FIX: the second line above was a bare, un-commented continuation,
# which made this snippet a SyntaxError.)
# Text to speech: returns a list of amplitude values as output.
wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav",
              language="en")

# Text to speech written directly to a file.
tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav",
                language="en", file_path="output.wav")
代码
import os
import glob
import gradio as gr
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
import torch
import torchaudio
# Directory layout (all relative to the launch directory).
ROOT_DIR=os.getcwd()
print(ROOT_DIR)
# Point TTS's model cache at the project root instead of ~/.local/share.
os.environ['TTS_HOME']=ROOT_DIR
TMP_DIR=os.path.join(ROOT_DIR, "static/tmp")  # scratch space for recordings
VOICE_DIR=os.path.join(ROOT_DIR, "static/voicelist")  # pre-stored reference voices
OUTPUT = os.path.join(ROOT_DIR, "static/tmp/output.wav")  # synthesized result path
RECORD = os.path.join(TMP_DIR, "tmp.wav")  # uploaded/recorded reference audio
# Get device
# NOTE(review): "cuda:1" hard-codes the second GPU — confirm the deployment
# host actually has two devices, otherwise this raises at model.to(device).
device = "cuda:1" if torch.cuda.is_available() else "cpu"
print("Loading model...")
# Load the XTTS-v2 checkpoint from the local model directory (no download).
config = XttsConfig()
config.load_json(os.path.join(ROOT_DIR,"tts/tts_models--multilingual--multidataset--xtts_v2/config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(config,
checkpoint_dir=os.path.join(ROOT_DIR,"tts/tts_models--multilingual--multidataset--xtts_v2/"), use_deepspeed=False)
model.to(device)
# Inference-only: disable dropout/batch-norm training behavior.
model.eval()
"""
utilities
"""
# Collect the existing .wav files from ./static/voicelist/ for the dropdown.
def get_wav_to_Dropdown(voice_dir=None):
    """Return (basename, path) choices for every non-empty .wav file.

    Args:
        voice_dir: Directory to scan. Defaults to the module-level
            ``VOICE_DIR`` so existing callers are unaffected.

    Returns:
        list[tuple[str, str]]: (display name, file path) pairs suitable for
        ``gr.Dropdown(choices=...)``. Zero-byte files are skipped, matching
        the original behavior.
    """
    if voice_dir is None:
        voice_dir = VOICE_DIR
    wavs = glob.glob(f"{voice_dir}/*.wav")
    return [
        (os.path.basename(path), path)
        for path in wavs
        if os.path.getsize(path) > 0
    ]
# Play an already-stored audio sample.
def display_wav(speaker_wav):
    """Echo the selected sample path back so the gr.Audio widget plays it."""
    return speaker_wav
"""
text: 需要被转换的文本
speaker_wav: cloned voice
record_audio: 上传或记录音频,优先使用这部分的音频
"""
def text_to_speech(text, speaker_wav, record_audio, temperature):
    """Clone a reference voice and synthesize ``text`` (language "zh-cn").

    Args:
        text: Text to convert to speech.
        speaker_wav: Basename of a stored reference wav inside ``VOICE_DIR``.
        record_audio: Optional (sample_rate, samples) tuple from the gradio
            Audio component; when present it takes priority over speaker_wav.
        temperature: Sampling temperature forwarded to ``model.inference``.

    Returns:
        str: Path of the synthesized wav file (``OUTPUT``).
    """
    # BUG FIXES vs. original:
    #  * the `gpt_cond_latent, speaker_embedding =` assignments were split
    #    across lines without continuation (SyntaxError);
    #  * `os.path.join(VOICE_DIR, RECORD)` was a no-op because RECORD is
    #    already an absolute path — use RECORD directly;
    #  * the two branches duplicated the whole latent/inference/save flow —
    #    only the reference-wav selection actually differs.
    if record_audio is not None:
        # Recorded/uploaded audio wins: persist it to the temp wav first.
        sr, samples = record_audio
        torchaudio.save(RECORD, torch.tensor(samples).unsqueeze(0), sr)
        reference_wav = RECORD
    else:
        reference_wav = os.path.join(VOICE_DIR, speaker_wav)

    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=[reference_wav]
    )

    print("Inference...")
    out = model.inference(
        text,
        "zh-cn",
        gpt_cond_latent,
        speaker_embedding,
        temperature=temperature,  # Add custom parameters here
    )
    # XTTS emits 24 kHz audio; save mono with a leading channel dimension.
    torchaudio.save(OUTPUT, torch.tensor(out["wav"]).unsqueeze(0), 24000)
    return OUTPUT
# Gradio UI: the left column collects text + a reference voice, the right
# column plays the cloned speech produced by text_to_speech().
with gr.Blocks() as demo:
    with gr.Row():
        # Left-hand inputs
        with gr.Column():
            txt = gr.Textbox(value="你好,我是一个文字转语音测试。", label="Input:",
                             lines=1, placeholder="请输入待语音克隆的文字")
            temperature = gr.Slider(label="temperature:", minimum=0, maximum=1,
                                    value=0.7)
            with gr.Row():
                speaker_wavs = gr.Dropdown(label="Speaker wavs:",
                                           choices=get_wav_to_Dropdown())
                btn_display = gr.Button(value="播放已选音频", scale=0.25)
            # Hidden player that auto-plays whatever path display_wav returns.
            display_au = gr.Audio(visible=False, autoplay=True)
            # NOTE(review): `speaker_wavs` is a component object and is always
            # truthy at build time — presumably this was meant to guard on
            # the dropdown having choices; confirm intent.
            if speaker_wavs:
                btn_display.click(display_wav, inputs=[speaker_wavs], outputs=
                                  [display_au])
            record_au = gr.Audio(label="录制或上传语音:", format="wav",
                                 show_download_button=True)
            with gr.Row():
                clear_btn = gr.ClearButton()
                gen_btn = gr.Button(value="生成")
        # Right-hand output
        with gr.Column():
            output_au = gr.Audio(label="语音克隆结果: ", show_download_button=True)
    gen_btn.click(text_to_speech, inputs=[txt, speaker_wavs, record_au,
                                          temperature], outputs=[output_au])
问题记录
1. 模型位置
def get_user_data_dir(appname):
    """Resolve the per-user data directory used by TTS for ``appname``.

    Resolution order: ``$TTS_HOME``, then ``$XDG_DATA_HOME``, then the
    platform default (Windows Local AppData via the registry, macOS
    Application Support, otherwise ``~/.local/share``).

    Args:
        appname: Application subdirectory name appended to the base dir.

    Returns:
        pathlib.Path: base directory joined with ``appname``.
    """
    TTS_HOME = os.environ.get("TTS_HOME")
    XDG_DATA_HOME = os.environ.get("XDG_DATA_HOME")
    if TTS_HOME is not None:
        ans = Path(TTS_HOME).expanduser().resolve(strict=False)
    elif XDG_DATA_HOME is not None:
        ans = Path(XDG_DATA_HOME).expanduser().resolve(strict=False)
    elif sys.platform == "win32":
        import winreg  # Windows-only stdlib module; import lazily.

        # BUG FIX: the registry key path had lost its backslashes
        # ("SoftwareMicrosoftWindows...") and could never be opened.
        key = winreg.OpenKey(
            winreg.HKEY_CURRENT_USER,
            r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders",
        )
        dir_, _ = winreg.QueryValueEx(key, "Local AppData")
        ans = Path(dir_).resolve(strict=False)
    elif sys.platform == "darwin":
        ans = Path("~/Library/Application Support/").expanduser()
    else:
        # Fallback when TTS_HOME is unset: models download into ~/.local/share.
        ans = Path.home().joinpath(".local/share")
    return ans.joinpath(appname)
2. 环境兼容
3. 临时域名访问
4. 长文本输入
4
总结与思考
从我们的实践来看,语音生成效果已经取得长足的进步,zero-shot语音克隆的效果也基本满意,但由于语音模型预训练数据分布的问题,大规模、高质量的中文开源预训练数据集较少,因此与英文相比,实时中文语音克隆的效果会差一些。此外,流式语音克隆是大势所趋,实时的输入音频,即时地生成语音,需要进一步优化推理所需的资源和生成语音的延迟。
作者:obliviate(云腾实验室)
编辑:小椰风
更多阅读
原文始发于微信公众号(安全极客):AI实践 | 一文简述语音克隆实践
- 左青龙
- 微信扫一扫
- 右白虎
- 微信扫一扫
评论