import { env, Tensor, AutoTokenizer, SpeechT5ForTextToSpeech, SpeechT5HifiGan } from '@xenova/transformers';
import { encodeWAV } from './utils';
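
// `encodeWAV` is defined in ./utils, which is not shown here. As a rough,
// assumed sketch (not necessarily the tutorial's exact implementation), such a
// helper wraps the raw Float32Array samples in a 44-byte WAV header using
// format 3 (32-bit IEEE float); SpeechT5's vocoder outputs mono audio at 16 kHz:
//
//     export function encodeWAV(samples, sampleRate = 16000) {
//         const buffer = new ArrayBuffer(44 + samples.length * 4);
//         const view = new DataView(buffer);
//         const writeString = (offset, str) => {
//             for (let i = 0; i < str.length; ++i) view.setUint8(offset + i, str.charCodeAt(i));
//         };
//         writeString(0, 'RIFF');
//         view.setUint32(4, 36 + samples.length * 4, true); // remaining chunk size
//         writeString(8, 'WAVE');
//         writeString(12, 'fmt ');
//         view.setUint32(16, 16, true);                     // fmt chunk size
//         view.setUint16(20, 3, true);                      // format 3 = IEEE float
//         view.setUint16(22, 1, true);                      // mono
//         view.setUint32(24, sampleRate, true);
//         view.setUint32(28, sampleRate * 4, true);         // byte rate
//         view.setUint16(32, 4, true);                      // block align
//         view.setUint16(34, 32, true);                     // bits per sample
//         writeString(36, 'data');
//         view.setUint32(40, samples.length * 4, true);     // data chunk size
//         for (let i = 0; i < samples.length; ++i) {
//             view.setFloat32(44 + i * 4, samples[i], true);
//         }
//         return buffer;
//     }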

// Disable local model checks; models are always fetched from the Hugging Face Hub.
env.allowLocalModels = false;

// Use the Singleton pattern to enable lazy construction of the pipeline.
class MyTextToSpeechPipeline {
    // Base URL of the precomputed CMU ARCTIC speaker embeddings (x-vectors).
    static BASE_URL = 'https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/';

    static model_id = 'Xenova/speecht5_tts';
    static vocoder_id = 'Xenova/speecht5_hifigan';

    static tokenizer_instance = null;
    static model_instance = null;
    static vocoder_instance = null;

    static async getInstance(progress_callback = null) {
        // `from_pretrained` returns a promise; storing the promise (rather than
        // the resolved value) means repeated calls reuse the same download.
        if (this.tokenizer_instance === null) {
            this.tokenizer_instance = AutoTokenizer.from_pretrained(this.model_id, { progress_callback });
        }

        if (this.model_instance === null) {
            this.model_instance = SpeechT5ForTextToSpeech.from_pretrained(this.model_id, {
                quantized: false,
                progress_callback,
            });
        }

        if (this.vocoder_instance === null) {
            this.vocoder_instance = SpeechT5HifiGan.from_pretrained(this.vocoder_id, {
                quantized: false,
                progress_callback,
            });
        }

        const result = await Promise.all([
            this.tokenizer_instance,
            this.model_instance,
            this.vocoder_instance,
        ]);

        // Notify the main thread that all three models have loaded.
        self.postMessage({ status: 'ready' });

        return result;
    }

    static async getSpeakerEmbeddings(speaker_id) {
        // Each speaker has a 512-dimensional x-vector stored as a raw binary
        // file of float32 values, e.g., `cmu_us_awb_arctic-wav-arctic_a0001.bin`.
        const speaker_embeddings_url = `${this.BASE_URL}${speaker_id}.bin`;
        const speaker_embeddings = new Tensor(
            'float32',
            new Float32Array(await (await fetch(speaker_embeddings_url)).arrayBuffer()),
            [1, 512],
        );
        return speaker_embeddings;
    }
}

// Mapping of cached speaker embeddings
const speaker_embeddings_cache = new Map();

// Listen for messages from the main thread
self.addEventListener('message', async (event) => {
    // Load the pipeline, forwarding progress updates so that the main thread
    // can track model loading.
    const [tokenizer, model, vocoder] = await MyTextToSpeechPipeline.getInstance(x => {
        self.postMessage(x);
    });

    // Tokenize the input text
    const { input_ids } = tokenizer(event.data.text);

    // Load the speaker embeddings (from the cache, if possible)
    let speaker_embeddings = speaker_embeddings_cache.get(event.data.speaker_id);
    if (speaker_embeddings === undefined) {
        speaker_embeddings = await MyTextToSpeechPipeline.getSpeakerEmbeddings(event.data.speaker_id);
        speaker_embeddings_cache.set(event.data.speaker_id, speaker_embeddings);
    }

    // Generate the waveform
    let response;
    try {
        response = await model.generate_speech(input_ids, speaker_embeddings, { vocoder });
    } catch (e) {
        self.postMessage({
            status: 'error',
            exception: e,
        });
        throw e;
    }
    const { waveform } = response;

    // Encode the waveform as a WAV file
    const wav = encodeWAV(waveform.data);

    // Send the output back to the main thread
    self.postMessage({
        status: 'complete',
        output: new Blob([wav], { type: 'audio/wav' }),
    });
});
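
// ---------------------------------------------------------------------------
// Hypothetical usage from the main thread (an illustrative sketch; the file
// name `worker.js` and the chosen speaker ID are assumptions, not prescribed
// by this worker):
//
//     const worker = new Worker(new URL('./worker.js', import.meta.url), { type: 'module' });
//
//     worker.addEventListener('message', (event) => {
//         switch (event.data.status) {
//             case 'ready':
//                 // All three models have finished loading.
//                 break;
//             case 'complete': {
//                 // `event.data.output` is an `audio/wav` Blob, ready to play.
//                 const url = URL.createObjectURL(event.data.output);
//                 new Audio(url).play();
//                 break;
//             }
//             case 'error':
//                 console.error(event.data.exception);
//                 break;
//             default:
//                 // Progress updates forwarded from the loading callback.
//                 break;
//         }
//     });
//
//     worker.postMessage({ text: 'Hello, world!', speaker_id: 'cmu_us_awb_arctic-wav-arctic_a0001' });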