worker.js

import { env, Tensor, AutoTokenizer, SpeechT5ForTextToSpeech, SpeechT5HifiGan } from '@xenova/transformers';
import { encodeWAV } from './utils';

// Disable local model checks
env.allowLocalModels = false;
// Use the Singleton pattern to enable lazy construction of the pipeline.
class MyTextToSpeechPipeline {
  static BASE_URL = 'https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/';

  static model_id = 'Xenova/speecht5_tts';
  static vocoder_id = 'Xenova/speecht5_hifigan';

  static tokenizer_instance = null;
  static model_instance = null;
  static vocoder_instance = null;

  static async getInstance(progress_callback = null) {
    // from_pretrained returns a promise; store it so subsequent calls reuse
    // the same in-flight (or resolved) instance instead of re-downloading.
    if (this.tokenizer_instance === null) {
      this.tokenizer_instance = AutoTokenizer.from_pretrained(this.model_id, { progress_callback });
    }

    if (this.model_instance === null) {
      this.model_instance = SpeechT5ForTextToSpeech.from_pretrained(this.model_id, {
        quantized: false,
        progress_callback,
      });
    }

    if (this.vocoder_instance === null) {
      this.vocoder_instance = SpeechT5HifiGan.from_pretrained(this.vocoder_id, {
        quantized: false,
        progress_callback,
      });
    }

    // Wait for all three to finish loading, then notify the main thread.
    const result = await Promise.all([
      this.tokenizer_instance,
      this.model_instance,
      this.vocoder_instance,
    ]);
    self.postMessage({ status: 'ready' });
    return result;
  }

  // Fetch the precomputed speaker embeddings (x-vector) for the given speaker.
  static async getSpeakerEmbeddings(speaker_id) {
    // e.g., `cmu_us_awb_arctic-wav-arctic_a0001`
    const speaker_embeddings_url = `${this.BASE_URL}${speaker_id}.bin`;
    const speaker_embeddings = new Tensor(
      'float32',
      new Float32Array(await (await fetch(speaker_embeddings_url)).arrayBuffer()),
      [1, 512],
    );
    return speaker_embeddings;
  }
}

// Mapping of cached speaker embeddings
const speaker_embeddings_cache = new Map();

// Listen for messages from the main thread
self.addEventListener('message', async (event) => {
  // Load the pipeline (construction only happens on the first message),
  // forwarding progress updates so the main thread can track model loading.
  const [tokenizer, model, vocoder] = await MyTextToSpeechPipeline.getInstance(x => {
    self.postMessage(x);
  });

  // Tokenize the input text
  const { input_ids } = tokenizer(event.data.text);

  // Load the speaker embeddings, reusing a cached copy if available
  let speaker_embeddings = speaker_embeddings_cache.get(event.data.speaker_id);
  if (speaker_embeddings === undefined) {
    speaker_embeddings = await MyTextToSpeechPipeline.getSpeakerEmbeddings(event.data.speaker_id);
    speaker_embeddings_cache.set(event.data.speaker_id, speaker_embeddings);
  }

  // Generate the waveform
  let response;
  try {
    response = await model.generate_speech(input_ids, speaker_embeddings, { vocoder });
  } catch (e) {
    // Surface the error to the main thread before re-throwing it here
    self.postMessage({
      status: 'error',
      exception: e,
    });
    throw e;
  }
  const { waveform } = response;

  // Encode the waveform as a WAV file
  const wav = encodeWAV(waveform.data);

  // Send the output back to the main thread
  self.postMessage({
    status: 'complete',
    output: new Blob([wav], { type: 'audio/wav' }),
  });
});
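
For reference, a minimal sketch of the main-thread side this worker expects. The message protocol (progress updates while loading, then 'ready', 'complete', or 'error') comes from the worker code above; the module-worker constructor, the example text, and the speaker id (taken from the comment in getSpeakerEmbeddings) are assumptions, not part of the original file.

// main.js (hypothetical)
const worker = new Worker(new URL('./worker.js', import.meta.url), { type: 'module' });

worker.addEventListener('message', (event) => {
  const message = event.data;
  switch (message.status) {
    case 'ready':
      // All three models have finished loading
      console.log('Pipeline ready');
      break;
    case 'complete': {
      // Play the generated WAV blob
      const audio = new Audio(URL.createObjectURL(message.output));
      audio.play();
      break;
    }
    case 'error':
      console.error(message.exception);
      break;
    default:
      // Loading-progress updates forwarded from `from_pretrained`
      console.log(message);
  }
});

// Request speech for some text with a given speaker
worker.postMessage({
  text: 'Hello, my dog is cute.',
  speaker_id: 'cmu_us_awb_arctic-wav-arctic_a0001',
});

The encodeWAV helper imported from ./utils is not shown on this page. Below is a sketch of what such a helper could look like, assuming mono 32-bit-float samples at 16 kHz (SpeechT5's output sampling rate); the actual implementation in ./utils may differ.

// utils.js (sketch)
export function encodeWAV(samples, sampleRate = 16000) {
  // 44-byte RIFF/WAVE header followed by the raw IEEE-float samples
  const buffer = new ArrayBuffer(44 + samples.length * 4);
  const view = new DataView(buffer);
  const writeString = (offset, str) => {
    for (let i = 0; i < str.length; ++i) view.setUint8(offset + i, str.charCodeAt(i));
  };

  writeString(0, 'RIFF');
  view.setUint32(4, 36 + samples.length * 4, true); // remaining file length
  writeString(8, 'WAVE');
  writeString(12, 'fmt ');
  view.setUint32(16, 16, true);                     // fmt chunk size
  view.setUint16(20, 3, true);                      // format 3 = IEEE float
  view.setUint16(22, 1, true);                      // mono
  view.setUint32(24, sampleRate, true);             // sample rate
  view.setUint32(28, sampleRate * 4, true);         // byte rate
  view.setUint16(32, 4, true);                      // block align
  view.setUint16(34, 32, true);                     // bits per sample
  writeString(36, 'data');
  view.setUint32(40, samples.length * 4, true);     // data chunk size
  for (let i = 0; i < samples.length; ++i) {
    view.setFloat32(44 + i * 4, samples[i], true);
  }
  return buffer;
}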