/* sinsy.cpp */
/* ----------------------------------------------------------------- */
/* The HMM-Based Singing Voice Synthesis System "Sinsy" */
/* developed by Sinsy Working Group */
/* http://sinsy.sourceforge.net/ */
/* ----------------------------------------------------------------- */
/* */
/* Copyright (c) 2009-2015 Nagoya Institute of Technology */
/* Department of Computer Science */
/* */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* - Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* - Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials provided */
/* with the distribution. */
/* - Neither the name of the Sinsy working group nor the names of */
/* its contributors may be used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
/* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
/* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
/* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
/* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
/* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* ----------------------------------------------------------------- */
#include <assert.h>
#include <string.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <espeak-ng/espeak_ng.h>
#include <espeak-ng/speak_lib.h>
#include <espeak-ng/encoding.h>
#include <ringbuffer.h>
#include <samplerate.h>
#include <sndfile.h>

#include "sinsy.h"

using namespace std;
  55. namespace
  56. {
  57. //const char* DEFAULT_LANGS = "fi";
  58. };
  59. void usage()
  60. {
  61. #ifdef HAVE_HTS
  62. std::cout << "The HMM-Based Singing Voice Syntheis System \"Sinsy\"" << std::endl;
  63. #else
  64. std::cout << "The Formant-Based Singing Voice Syntheis System \"SinsyNG\"" << std::endl;
  65. #endif
  66. std::cout << "Version 0.92 (http://sinsy.sourceforge.net/)" << std::endl;
  67. std::cout << "Copyright (C) 2009-2015 Nagoya Institute of Technology" << std::endl;
  68. std::cout << "All rights reserved." << std::endl;
  69. std::cout << "" << std::endl;
  70. #ifdef HAVE_HTS
  71. std::cout << "The HMM-Based Speech Synthesis Engine \"hts_engine API\"" << std::endl;
  72. std::cout << "Version 1.10 (http://hts-engine.sourceforge.net/)" << std::endl;
  73. std::cout << "Copyright (C) 2001-2015 Nagoya Institute of Technology" << std::endl;
  74. std::cout << " 2001-2008 Tokyo Institute of Technology" << std::endl;
  75. std::cout << "All rights reserved." << std::endl;
  76. std::cout << "" << std::endl;
  77. std::cout << "sinsy - The HMM-based singing voice synthesis system \"Sinsy\"" << std::endl;
  78. #else
  79. std::cout << "The eSpeak NG (Next Generation) Text-to-Speech Synthesis Engine" << std::endl;
  80. std::cout << "Copyright (C) 2005-2014 Jonathan Duddington" << std::endl;
  81. std::cout << " 2015-2017 Reece H. Dunn" << std::endl;
  82. #endif
  83. std::cout << "" << std::endl;
  84. std::cout << " usage:" << std::endl;
  85. std::cout << " sinsy [ options ] [ infile ]" << std::endl;
  86. std::cout << " options: [def]" << std::endl;
  87. #ifdef HAVE_HTS
  88. std::cout << " -w langs : languages [ j]" << std::endl;
  89. std::cout << " j: Japanese " << std::endl;
  90. std::cout << " (Currently, you can set only Japanese) " << std::endl;
  91. #else
  92. std::cout << " -w langs : languages [ en]" << std::endl;
  93. #endif
  94. #ifdef HAVE_HTS
  95. std::cout << " -x dir : dictionary directory [N/A]" << std::endl;
  96. std::cout << " -m htsvoice : HTS voice file [N/A]" << std::endl;
  97. #endif
  98. std::cout << " -o file : filename of output wav audio [N/A]" << std::endl;
  99. std::cout << " infile:" << std::endl;
  100. std::cout << " MusicXML file" << std::endl;
  101. }
  102. static int SynthCallback(short *wav, int numsamples, espeak_EVENT *events)
  103. {
  104. //dummy
  105. return 0;
  106. }
  107. class ECantorix : public sinsy::IScore
  108. {
  109. int inputSamplerate=0;
  110. int outputSamplerate=0;
  111. bool inTie=false;
  112. size_t tieDuration=0;
  113. std::string tieLyrics;
  114. float* resample(float* data_in,int input_frames,float ratio,int *output_frames)
  115. {
  116. *output_frames = input_frames*ratio;
  117. float* ret = (float*)malloc(sizeof(float)*output_frames[0]);
  118. SRC_DATA data;
  119. data.data_in = data_in;
  120. data.data_out = ret;
  121. data.input_frames = input_frames;
  122. data.output_frames = *output_frames;
  123. data.src_ratio = ratio;
  124. src_simple(&data,SRC_SINC_BEST_QUALITY,1);
  125. return ret;
  126. }
  127. public:
  128. ECantorix(){
  129. }
  130. virtual ~ECantorix() {
  131. }
  132. void init()
  133. {
  134. char *data_path = NULL; // use default path for espeak-ng-data
  135. espeak_ng_InitializePath(data_path);
  136. espeak_ng_ERROR_CONTEXT context = NULL;
  137. espeak_ng_STATUS result = espeak_ng_Initialize(&context);
  138. if (result != ENS_OK) {
  139. espeak_ng_PrintStatusCodeMessage(result, stderr, context);
  140. espeak_ng_ClearErrorContext(&context);
  141. exit(1);
  142. }
  143. sinsy_ng_Init();
  144. result = espeak_ng_InitializeOutput(ENOUTPUT_MODE_SYNCHRONOUS, 0, NULL);
  145. espeak_SetSynthCallback(SynthCallback);//dummy synth callback
  146. inputSamplerate = espeak_ng_GetSampleRate();
  147. }
  148. void setOutputSamplerate(int fs)
  149. {
  150. outputSamplerate = fs;
  151. }
  152. void setVoiceByName(const std::string& voicename)
  153. {
  154. espeak_ng_STATUS result = espeak_ng_SetVoiceByName(voicename.c_str());
  155. if (result != ENS_OK) {
  156. fprintf(stderr,"ESPEAK_ERROR voice %s not found\n",voicename.c_str());
  157. exit(1);
  158. }
  159. }
  160. //! set encoding
  161. virtual bool setEncoding(const std::string& encoding)
  162. {
  163. if(encoding=="utf-8") return true;
  164. fprintf(stderr,"setEncoding %s\n",encoding.c_str());
  165. return false;
  166. }
  167. //! add key mark
  168. virtual bool addKeyMark(sinsy::ModeType modeType, int fifths)
  169. {
  170. fprintf(stderr,"addKeyMark %i %i\n",(int)modeType,fifths);
  171. return true;
  172. }
  173. //! add beat mark (beats/beatType) to end of score: default beat mark is 4/4
  174. virtual bool addBeatMark(size_t beats, size_t beatType)
  175. {
  176. fprintf(stderr,"addBeatMark %i %i\n",(int)beats,(int)beatType);
  177. return true;
  178. }
  179. //! add tempo mark to end of score: default tempo is 100bps
  180. virtual bool addTempoMark(double tempo)
  181. {
  182. fprintf(stderr,"addTempoMark %f\n",tempo);
  183. return true;
  184. }
  185. //! add dynamics mark (sudden changes) to end of score
  186. virtual bool addSuddenDynamicsMark(sinsy::SuddenDynamicsType suddenDynamicsType)
  187. {
  188. fprintf(stderr,"addSuddenDynamicsMark %i\n",(int)suddenDynamicsType);
  189. return true;
  190. }
  191. //! add dynamics mark (gradual changes) to end of score
  192. virtual bool addGradualDynamicsMark(sinsy::GradualDynamicsType gradualDynamicsType)
  193. {
  194. fprintf(stderr,"addGradualDynamicsMark %i\n",(int)gradualDynamicsType);
  195. return true;
  196. }
  197. //! add note to end of score
  198. virtual bool addNote(size_t duration, const std::string& lyric, size_t pitch, bool accent, bool staccato, sinsy::TieType tieType, sinsy::SlurType slurType, sinsy::SyllabicType syllabicType, bool breath = false)
  199. {
  200. if(tieType==sinsy::TIETYPE_BEGIN) inTie=true;
  201. if(inTie) {
  202. tieLyrics+=lyric;
  203. tieDuration+=duration;
  204. }
  205. if(!inTie) sinsy_ng_addNote(duration,lyric.c_str(),pitch,accent,staccato,slurType,syllabicType,breath);
  206. if(tieType==sinsy::TIETYPE_END)
  207. {
  208. sinsy_ng_addNote(tieDuration,tieLyrics.c_str(),pitch,accent,staccato,slurType,syllabicType,breath);
  209. tieDuration=0;
  210. tieLyrics="";
  211. inTie=false;
  212. }
  213. //fprintf(stderr,"addNote %i [%s] pitch=%i accent=%i staccato=%i tieType=%i slurType=%i syllabicType=%i breath=%i\n",duration,lyric.c_str(),pitch,accent,staccato,tieType,slurType,syllabicType,breath);
  214. return true;
  215. }
  216. //! add rest to end of score
  217. virtual bool addRest(size_t duration)
  218. {
  219. if(inTie)
  220. {
  221. fprintf(stderr,"cannot tie rests\n");
  222. exit(1);
  223. }
  224. sinsy_ng_addRest(duration);
  225. // fprintf(stderr,"addRest %i\n",duration);
  226. return true;
  227. }
  228. void saveTo(std::string fileName)
  229. {
  230. int data_length = 0;
  231. float* data2=0;
  232. float* data = sinsy_ng_getAudioData(&data_length);
  233. int fs=inputSamplerate;
  234. if(outputSamplerate!=0 && outputSamplerate!=inputSamplerate)
  235. {
  236. fs=outputSamplerate;
  237. int data_length2;
  238. data2 = data;
  239. float ratio = outputSamplerate*1.0/inputSamplerate;
  240. data = resample(data,data_length,ratio,&data_length2);
  241. fprintf(stderr,"resample %i %i\n",data_length,data_length2);
  242. data_length = data_length2;
  243. }
  244. SF_INFO info;
  245. memset(&info,0,sizeof(info));
  246. info.channels = 1;
  247. info.format = SF_FORMAT_WAV | SF_FORMAT_PCM_16;
  248. info.samplerate = fs;
  249. SNDFILE* sndfile = sf_open(fileName.c_str(),SFM_WRITE,&info);
  250. sf_write_float(sndfile,data,data_length);
  251. free(data);
  252. sf_close(sndfile);
  253. fprintf(stderr,"saving to %s length=%i\n",fileName.c_str(),data_length);
  254. if(data2) free(data2);
  255. }
  256. };
  257. ECantorix ecantorix;
  258. class CommandHandler
  259. {
  260. vector<string> argv;
  261. void rest()
  262. {
  263. if(argv.size()>1) ecantorix.addRest(atoi(argv[1].c_str()));
  264. }
  265. void note()
  266. {
  267. bool breath = false;
  268. if(argv.size()>9) breath = atoi(argv[9].c_str());
  269. if(argv.size()>8) ecantorix.addNote(atoi(argv[1].c_str()),argv[2],atoi(argv[3].c_str()),
  270. (bool)atoi(argv[4].c_str()),(bool)atoi(argv[5].c_str()),
  271. (sinsy::TieType)atoi(argv[6].c_str()),(sinsy::SlurType)atoi(argv[7].c_str()),(sinsy::SyllabicType)atoi(argv[8].c_str()),breath);
  272. }
  273. void resample()
  274. {
  275. if(argv.size()>1) ecantorix.setOutputSamplerate(atoi(argv[1].c_str()));
  276. }
  277. void voice()
  278. {
  279. if(argv.size()>1) ecantorix.setVoiceByName(argv[1]);
  280. }
  281. public:
  282. void parseCMD(const string& input)
  283. {
  284. std::string tmp;
  285. bool quote = false;
  286. for(int i=0;i<input.length();i++)
  287. {
  288. if(i==0 && input[i]=='#') return;//comment
  289. if(input[i]=='\"' && quote==false)
  290. {
  291. quote=true;
  292. }
  293. else if(input[i]=='\"' && quote==true)
  294. {
  295. quote=false;
  296. }
  297. else if(input[i]==' ' && quote==false)
  298. {
  299. if(tmp.length()) argv.push_back(tmp);
  300. tmp = "";
  301. }
  302. else if(i<input.length()-1 && input[i]=='\\')
  303. {
  304. i++;
  305. tmp += input[i];
  306. }
  307. else
  308. {
  309. tmp += input[i];
  310. }
  311. }
  312. if(tmp.length()) argv.push_back(tmp);
  313. string cmd = argv[0];
  314. if(cmd=="note") note();
  315. if(cmd=="rest") rest();
  316. if(cmd=="resample") resample();
  317. if(cmd=="voice") voice();
  318. argv.clear();
  319. }
  320. };
  321. int handleUScore(const std::string& uscore,const std::string& wav) {
  322. ecantorix.init();
  323. std::ifstream input(uscore);
  324. CommandHandler cmd;
  325. while(input) {
  326. string input_line;
  327. getline(input, input_line);
  328. cmd.parseCMD(input_line);
  329. };
  330. ecantorix.saveTo(wav);
  331. return 0;
  332. }
  333. int main(int argc, char **argv)
  334. {
  335. if (argc < 2) {
  336. usage();
  337. return -1;
  338. }
  339. std::string xml;
  340. std::string voice;
  341. #ifdef HAVE_HTS
  342. std::string config;
  343. #endif
  344. std::string wav;
  345. std::string languages;
  346. std::string uscore;
  347. voice = "en";
  348. int i(1);
  349. for(; i < argc; ++i) {
  350. if ('-' != argv[i][0]) {
  351. if (xml.empty()) {
  352. xml = argv[i];
  353. } else {
  354. std::cout << "[ERROR] invalid option : '" << argv[i][1] << "'" << std::endl;
  355. usage();
  356. return -1;
  357. }
  358. } else {
  359. switch (argv[i][1]) {
  360. case 'w' :
  361. languages = argv[++i];
  362. break;
  363. #ifdef HAVE_HTS
  364. case 'x' :
  365. config = argv[++i];
  366. break;
  367. #endif
  368. case 'm' :
  369. voice = argv[++i];
  370. break;
  371. case 'o' :
  372. wav = argv[++i];
  373. break;
  374. case 'u' :
  375. uscore = argv[++i];
  376. break;
  377. case 'h' :
  378. usage();
  379. return 0;
  380. default :
  381. std::cout << "[ERROR] invalid option : '-" << argv[i][1] << "'" << std::endl;
  382. usage();
  383. return -1;
  384. }
  385. }
  386. }
  387. if(uscore.size()) {
  388. return handleUScore(uscore,wav);
  389. }
  390. if(xml.empty() || voice.empty() || wav.empty()) {
  391. usage();
  392. return -1;
  393. }
  394. sinsy::Sinsy sinsy;
  395. std::vector<std::string> voices;
  396. voices.push_back(voice);
  397. #ifdef HAVE_HTS
  398. if (!sinsy.setLanguages(languages, config)) {
  399. std::cout << "[ERROR] failed to set languages : " << languages << ", config dir : " << config << std::endl;
  400. return -1;
  401. }
  402. if (!sinsy.loadVoices(voices)) {
  403. std::cout << "[ERROR] failed to load voices : " << voice << std::endl;
  404. return -1;
  405. }
  406. #endif
  407. if (!sinsy.loadScoreFromMusicXML(xml)) {
  408. std::cout << "[ERROR] failed to load score from MusicXML file : " << xml << std::endl;
  409. return -1;
  410. }
  411. #ifdef HAVE_HTS
  412. sinsy::SynthCondition condition;
  413. if (wav.empty()) {
  414. condition.setPlayFlag();
  415. } else {
  416. condition.setSaveFilePath(wav);
  417. }
  418. sinsy.synthesize(condition);
  419. #else
  420. ECantorix ecantorix;
  421. ecantorix.init();
  422. ecantorix.setVoiceByName(voice);
  423. sinsy.toScore(ecantorix);
  424. #if 1
  425. if (wav.empty()) {
  426. // use https://github.com/espeak-ng/pcaudiolib
  427. //ecantorix.play();//
  428. } else {
  429. ecantorix.saveTo(wav);
  430. }
  431. #endif
  432. // if(g_sndfile) sf_close(g_sndfile);
  433. #endif
  434. return 0;
  435. }