from voxcpm import VoxCPM
import soundfile as sf
import numpy as np
import librosa
# Voice-cloning synthesis script: generate each text segment with VoxCPM,
# trim silence, and concatenate everything into a single output wav.
model = VoxCPM.from_pretrained(
    "openbmb/VoxCPM2",
    load_denoiser=False,  # denoiser off — assumes the prompt wav is already clean
)

segments = [
    "One sentence",
    "or more",
]

all_wavs = []
for seg in segments:
    # Voice control alternative: embed a directive such as
    # "(speaking like whispering, slow and mysterious tone)" in each
    # sentence/paragraph and disable the two prompt_* arguments below.
    wav = model.generate(
        text=seg,
        prompt_wav_path="path/to/10-30s/cloning.wav",  # disable this if you want voice control
        prompt_text="the cloning.wav transcript, punctuation matters.",  # disable this if you want voice control
        # NOTE(review): removed `reference_wav_path=` — VoxCPM.generate has no
        # such parameter (it duplicated prompt_wav_path and raised TypeError).
        cfg_value=1.5,           # 1.5 - 2.0, 2.5 for voice control
        inference_timesteps=10,  # 6 is faster with acceptable quality
        normalize=True,          # try False
        retry_badcase=True,      # try False
    )
    # Trim leading/trailing silence so concatenated segments flow naturally.
    wav_trimmed, _ = librosa.effects.trim(wav)
    all_wavs.append(wav_trimmed)

full_wav = np.concatenate(all_wavs)
sf.write("hifi_clone.wav", full_wav, model.tts_model.sample_rate)
print("saved: hifi_clone.wav")
# uv pip install -r requirements.txt
torch==2.8.0+cu129
torchaudio==2.8.0+cu129
voxcpm==2.0.2
soundfile==0.13.1
uv pip install https://huggingface.co/ussoewwin/Flash-Attention-2_for_Windows/resolve/main/flash_attn-2.8.2%2Bcu129torch2.8.0cxx11abiTRUE-cp312-cp312-win_amd64.whl
You may hit some minor errors — hand the script off to your favorite coding AI and let it modify the source.
In my opinion, voice control is better than providing a transcript: it is not a 100% clone, but the likeness is great.
Thanks to OpenBMB for releasing this open source; I hope you can lower the memory consumption and provide faster generation, like Kyutai Pocket TTS (the CPP version!).