|
- #include <sekai/UnisynIndex.h>
- #include <sekai/MBRSynth.h>
- #include <sndfile.h>
- #include <assert.h>
- #include <string>
- #include <fstream>
- #include <iostream>
- #include <sstream>
- #include <boost/algorithm/string/classification.hpp>
- #include <boost/algorithm/string/split.hpp>
- #include <boost/lexical_cast.hpp>
// TODO: the voice name and every location derived from it must not be hardcoded.

// Short name of the voice; the directory, index and output names below are built from it.
std::string voicename("krb");

// Voice directory: main() passes it to the synth as base dir and prepends it to indexfile.
std::string voicedir = std::string("/home/isengaara/Hacking/Audio/VoiceSynth/FESTIVAL/festival-czech-voices/voice-czech-")
                           .append(voicename)
                           .append("/");

// Diphone index file, relative to voicedir.
std::string indexfile = std::string("dic/").append(voicename).append("diph.est");

// Output file that main() opens for writing.
std::string wavfile = std::string(voicename).append(".wav");

// Input .pho file handed to readPho() by main().
// Alternative input kept for reference:
//std::string phofile = "/home/isengaara/Hacking/Audio/VoiceSynth/MBROLA/MBROLASing/Ukazky/Komplet/Prodanka_1-0_2-0.pho";
std::string phofile("/home/isengaara/Hacking/Audio/VoiceSynth/MBROLA/MBROLASing/Ukazky/Komplet/Prodanka_3-0_4-0.pho");
// One entry of the .pho file as stored by readPho().
// The numeric members are zero-initialised so a default-constructed phone
// never carries indeterminate values.
struct phone
{
    // Phone symbol: first token of the line.
    std::string name;
    // Duration: second token of the line divided by 1000
    // (presumably milliseconds converted to seconds - confirm against the input format).
    float length = 0.0f;
    // Sum of the durations of all preceding phones, in the same unit as length.
    float start = 0.0f;
};
// A unit selected by generate(): holds the base name copied from the
// voice-index entry that matched a "<phone>-<phone>" key.
struct diphone { std::string basename; };
- std::vector<phone> phones;
- UnisynIndex* voiceindex=nullptr;
- std::vector<diphone> diphones;
- ControlTrack* ctrl=nullptr;
// Translate a phone symbol from the .pho file into the symbol used when
// generate() builds "<left>-<right>" diphone keys for the voice index.
// Only the single-character symbols x, D, N and Z are rewritten; every other
// input (including the empty string) is returned unchanged.
std::string subst(std::string pho)
{
    // All rewritten symbols are exactly one character long.
    if(pho.size() != 1)
    {
        return pho;
    }

    switch(pho[0])
    {
        case 'x': return "ch";
        case 'D': return "d";
        case 'N': return "n";
        case 'Z': return "z";
        default:  return pho;
    }
}
- void readPho(std::string fileName) {
- std::ifstream infile(fileName);
- float pos=0;
- std::string line;
- bool firstPitch=true;
- float pitchFrq=0;
- ctrl = new ControlTrack();
- float last_pos2=0;
- while (std::getline(infile, line)) {
- if(line.length())
- {
- std::vector<std::string> spl;
- boost::split(spl, line, boost::is_any_of("\t "),
- boost::token_compress_on);
-
- std::string pho = spl[0];
- printf("spl[1]=%s\n",spl[1].c_str());
- float length = boost::lexical_cast<float>(spl[1])/1000;
- phone p;
- p.start = pos;
- p.length = length;
- p.name = pho;
- phones.push_back(p);
- for(uint i=2;i<spl.size();i+=2)
- {
- if(spl[i].size()==0) {
- printf("error 1");
- break;
- }
- if(spl[i+1].size()==0) {
- printf("error 2");
- break;
- }
-
- printf("spl[%i]=%s\n",i,spl[i].c_str());
- printf("spl[%i]=%s\n",i+1,spl[i+1].c_str());
- float percent = boost::lexical_cast<float>(spl[i]);
- float frq = boost::lexical_cast<float>(spl[i+1]);
- if(firstPitch)
- {
- ctrl->addPoint(0,frq);
- firstPitch=false;
- }
- else
- {
- pitchFrq=frq;
- }
- float pos2=(pos+length*percent/100);
- ctrl->addPoint(pos2,frq);
- if(last_pos2==pos2)
- {
- float pos3=pos2+0.00000005;
- // printf("pos2=%f pos3=%f\n",pos2,pos3);
- ctrl->addPoint(pos3,frq);
- }
- else
- {
- // printf("pos2=%f\n",pos2);
- ctrl->addPoint(pos,frq);
- }
-
-
-
- last_pos2=pos;
- }
- pos+=length;
-
- }
- }
- ctrl->addPoint(pos,pitchFrq);
- ctrl->addPoint(pos+1.0,pitchFrq);
- ctrl->fix();
- }
- //params needed:
- // voicepath
- // synthExt
- // synthDefParams(mbr)
- void generate(MBRSynth *synth)
- {
- float end = 0;
- int sz = phones.size();
- printf("size=%i\n",sz);
- assert(sz>2);
- for(uint i=0;i<phones.size()-1;i++)
- {
- printf("i=%i\n",i);
- phone p0 = phones[i];
- phone p1 = phones[i+1];
- std::string diph = subst(p0.name)+"-"+subst(p1.name);
- auto found = voiceindex->getPho(diph);
-
- if(found.basename.size()> 0)
- {
- printf("found\n");
- //printf("%s -> %s\n",diph.c_str(),found.basename.c_str());
- diphone d;
- d.basename = found.basename;
- diphones.push_back(d);
- //printf(" pho: %f %f %f\n",p0.start,p1.start,p1.start+p1.length);
- //printf(" vox: %f %f %f\n",found.alignment[0],found.alignment[1],found.alignment[2]);
- float x[3] = {p0.start,p1.start,p1.start+p1.length};
- if(end)
- {
- float length = x[2]-x[1];
- float concat = length/2;
-
- //TODO: lookup concat somewhere
- if(concat>0.050) concat=0.050;
- printf("concat %f %s\n",concat,diph.c_str());
- assert(concat<length);
- x[1] = x[2] - concat;
- }
- end = x[3];
- float y[3] = {found.alignment[0],found.alignment[1],found.alignment[2]};
- synth->addUnit(found.basename,3,x,y);
- //std::string ogg = voicedir + "ogg/"+found.basename+".ogg";
- //printf("%s\n",ogg.c_str());
- //TODO:: meta info to emit
- }
- else
- {
- printf("not found\n");
- end=0;
- }
- }
- }
- int main()
- {
- printf("readpho\n");
- readPho(phofile);
- printf("create unisyn index\n");
- voiceindex = new UnisynIndex();
- voiceindex->readFromFile(voicedir+indexfile);
- printf("create synth\n");
-
- MBRConfig config;
- config.type=synthType::FVOX;
- config.mbr_period=0;
- config.frame_period=0;
- config.fft_size=0;
-
- MBRSynth *synth = new MBRSynth(ctrl);
- synth->setBasedir(voicedir);
- synth->setConfig(&config);
-
- //TODO: load ogg files / shared for UTAU and Unisyn
- printf("generate\n");
- generate(synth);
- printf("run\n");
- SF_INFO info = {0};
- info.samplerate = 32000;
- info.channels = 1;
- info.format = SF_FORMAT_WAV | SF_FORMAT_PCM_16;
- SNDFILE *sf = sf_open(wavfile.c_str(), SFM_WRITE, &info);
- while (1) {
- const int size = 1024;
- int fill = size * 4;
- float buffer_out[size];
- if (synth->readData(buffer_out, size, fill) == false) break;
- sf_write_float(sf, buffer_out, size);
- }
- sf_close(sf);
-
- }
|