// sekai -- repository necklace/sekai (forked from isengaara/sekai)
							#include <sekai/UnisynIndex.h>
#include <sekai/MBRSynth.h>
#include <sndfile.h>
#include <assert.h>

#include <string>

#include <fstream>
#include <iostream>
#include <sstream>

#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/lexical_cast.hpp>

// Voice selection and input/output locations.
// TODO: none of these should be hard-coded; they are development paths that
// ought to come from the command line or a configuration file.
std::string voicename{"krb"};
std::string voicedir{"/home/isengaara/Hacking/Audio/VoiceSynth/FESTIVAL/festival-czech-voices/voice-czech-" + voicename + "/"};
std::string indexfile{"dic/" + voicename + "diph.est"};  // joined onto voicedir in main()
std::string wavfile{voicename + ".wav"};                 // output file opened in main()

// Alternative input score, kept for reference:
//std::string phofile = "/home/isengaara/Hacking/Audio/VoiceSynth/MBROLA/MBROLASing/Ukazky/Komplet/Prodanka_1-0_2-0.pho";
std::string phofile{"/home/isengaara/Hacking/Audio/VoiceSynth/MBROLA/MBROLASing/Ukazky/Komplet/Prodanka_3-0_4-0.pho"};

/// One phoneme entry of the input score, as filled in by readPho().
/// The numeric members are zero-initialised so that a default-constructed
/// phone never carries indeterminate values.
struct phone
{
    std::string name;      ///< phoneme symbol (first field of the score line)
    float length = 0.0f;   ///< duration: second field of the line divided by 1000 (presumably ms -> s)
    float start = 0.0f;    ///< start offset: running sum of the lengths of all earlier phones
};

/// Record of a unit that generate() located in the voice index.
struct diphone {
    /// Base name of the unit, copied from the index lookup result.
    std::string basename;
};

// Program-wide state shared between readPho(), generate() and main().
std::vector<phone> phones{};        // score phonemes, appended in file order by readPho()
UnisynIndex *voiceindex{nullptr};   // voice index; created in main()
std::vector<diphone> diphones{};    // units that generate() found in the index
ControlTrack *ctrl{nullptr};        // pitch control track; allocated by readPho()

/// Map a score phoneme symbol to the symbol used when building diphone names.
/// Only the single-character symbols "x", "D", "N" and "Z" are rewritten;
/// every other input is returned unchanged.
std::string subst(std::string pho)
{
    // All remapped symbols are exactly one character long.
    if (pho.size() != 1) {
        return pho;
    }

    switch (pho[0]) {
        case 'x': return "ch";
        case 'D': return "d";
        case 'N': return "n";
        case 'Z': return "z";
        default:  return pho;
    }
}

/**
 * Parse a .pho score into the global `phones` list and a newly allocated
 * global pitch track `ctrl`.
 *
 * Every data line is split on tabs/spaces into
 *     <phoneme> <length> [<percent> <frequency>]...
 * The length is divided by 1000 and accumulated into a running position; each
 * (percent, frequency) pair becomes control points on `ctrl`. After the last
 * line two closing points carrying the last seen frequency are added (at the
 * final position and one unit later) and `ctrl->fix()` is called.
 *
 * Lines that are empty, contain only separators, or start with ';' (the
 * MBROLA comment marker) are skipped. Lines with fewer than two fields and a
 * trailing percent without a frequency are reported on stderr and ignored.
 *
 * @param fileName path of the score to read. If it cannot be opened a warning
 *                 is printed and `ctrl` ends up with only the closing points.
 * @throws boost::bad_lexical_cast when a numeric field is not a valid float.
 *
 * NOTE: `ctrl` is overwritten with a new ControlTrack on every call; a
 * previously allocated track is not released here.
 */
void readPho(std::string fileName) {
  std::ifstream infile(fileName);
  if (!infile) {
    // Carry on so that ctrl is still created and finalised for the caller.
    std::cerr << "readPho: cannot open '" << fileName << "'\n";
  }

  float pos = 0;
  std::string line;
  bool firstPitch = true;
  float pitchFrq = 0;
  ctrl = new ControlTrack();
  float last_pos2 = 0;

  while (std::getline(infile, line)) {
    // Tolerate CRLF files: a stray '\r' would otherwise end up in the last field.
    if (!line.empty() && line.back() == '\r') line.pop_back();
    if (line.empty()) continue;

    std::vector<std::string> spl;
    boost::split(spl, line, boost::is_any_of("\t "),
                 boost::token_compress_on);
    // token_compress_on merges runs of separators but still yields an empty
    // token when the line starts or ends with one; drop those so that the
    // column positions below are reliable.
    if (!spl.empty() && spl.front().empty()) spl.erase(spl.begin());
    if (!spl.empty() && spl.back().empty()) spl.pop_back();

    // Nothing but separators, or a comment line.
    if (spl.empty() || spl[0][0] == ';') continue;

    if (spl.size() < 2) {
      std::cerr << "readPho: ignoring line without a length: '" << line << "'\n";
      continue;
    }

    const std::string &pho = spl[0];
    printf("spl[1]=%s\n", spl[1].c_str());
    const float length = boost::lexical_cast<float>(spl[1]) / 1000;

    phone p;
    p.start = pos;
    p.length = length;
    p.name = pho;
    phones.push_back(p);

    // Pitch targets come in (percent, frequency) pairs after the length.
    // The bound i + 1 < size guarantees both halves of the pair exist.
    for (std::size_t i = 2; i + 1 < spl.size(); i += 2) {
      printf("spl[%zu]=%s\n", i, spl[i].c_str());
      printf("spl[%zu]=%s\n", i + 1, spl[i + 1].c_str());
      const float percent = boost::lexical_cast<float>(spl[i]);
      const float frq = boost::lexical_cast<float>(spl[i + 1]);

      if (firstPitch) {
        // Anchor the track at time 0 with the very first frequency.
        ctrl->addPoint(0, frq);
        firstPitch = false;
      }
      // Always remember the latest frequency so the closing points below use
      // it even when the score contains a single pitch target.
      pitchFrq = frq;

      const float pos2 = (pos + length * percent / 100);
      ctrl->addPoint(pos2, frq);
      // NOTE(review): last_pos2 is assigned the phone start (pos), not pos2,
      // so this comparison is against the start of the phone that produced
      // the previous pitch target. Kept as in the original; confirm intent.
      if (last_pos2 == pos2) {
        float pos3 = pos2 + 0.00000005;
        ctrl->addPoint(pos3, frq);
      } else {
        ctrl->addPoint(pos, frq);
      }

      last_pos2 = pos;
    }

    // An even field count >= 4 means complete pairs; an odd count above 2
    // leaves one percent value without its frequency.
    if (spl.size() > 2 && spl.size() % 2 != 0) {
      std::cerr << "readPho: ignoring pitch position without frequency: '"
                << spl.back() << "'\n";
    }

    pos += length;
  }

  ctrl->addPoint(pos, pitchFrq);
  ctrl->addPoint(pos + 1.0, pitchFrq);
  ctrl->fix();
}

//params needed:
// voicepath
// synthExt
// synthDefParams(mbr)

/**
 * Turn the parsed phoneme list into synthesis units.
 *
 * For every pair of neighbouring entries in the global `phones` list the name
 * "<subst(a)>-<subst(b)>" is looked up in `voiceindex`. When the lookup
 * returns a non-empty basename the unit is recorded in `diphones` and added to
 * the synth with three time points (start of the first phone, start of the
 * second, end of the second) paired with the unit's three alignment values.
 * If the previous pair was also found, the middle time point is moved to
 * `end of second phone - concat`, where concat is half the second phone's
 * length capped at 0.050.
 *
 * @param synth synthesiser that receives the units via addUnit().
 *
 * Requires more than two phones (asserted). Also asserts concat < length,
 * which does not hold for a zero-length second phone.
 */
void generate(MBRSynth *synth)
{
    // End time of the most recently placed unit; 0 means "no previous unit".
    float end = 0;
    int sz = phones.size();
    printf("size=%i\n",sz);
    assert(sz>2);
    // i + 1 < size() cannot wrap around, unlike size() - 1 on an empty vector
    // (relevant when the assert above is compiled out).
    for(std::size_t i=0;i+1<phones.size();i++)
    {
        printf("i=%zu\n",i);
        // phones is not modified inside the loop, so references stay valid.
        const phone &p0 = phones[i];
        const phone &p1 = phones[i+1];
        std::string diph = subst(p0.name)+"-"+subst(p1.name);
        auto found = voiceindex->getPho(diph);

        if(found.basename.size()> 0)
        {
            printf("found\n");
            //printf("%s -> %s\n",diph.c_str(),found.basename.c_str());
            diphone d;
            d.basename = found.basename;
            diphones.push_back(d);
            //printf("    pho: %f %f %f\n",p0.start,p1.start,p1.start+p1.length);
            //printf("    vox: %f %f %f\n",found.alignment[0],found.alignment[1],found.alignment[2]);
            float x[3] = {p0.start,p1.start,p1.start+p1.length};
            if(end)
            {
                float length = x[2]-x[1];
                float concat = length/2;

                //TODO: lookup concat somewhere
                if(concat>0.050) concat=0.050;
                printf("concat %f %s\n",concat,diph.c_str());
                assert(concat<length);
                x[1] = x[2] - concat;
            }
            // x has exactly three elements; the unit ends at x[2].
            end = x[2];
            float y[3] = {found.alignment[0],found.alignment[1],found.alignment[2]};
            synth->addUnit(found.basename,3,x,y);
            //std::string ogg = voicedir + "ogg/"+found.basename+".ogg";
            //printf("%s\n",ogg.c_str());
            //TODO:: meta info to emit
        }
        else
        {
            printf("not found\n");
            // A gap in the unit sequence: the next unit has no predecessor.
            end=0;
        }
    }
}


int main()
{
    printf("readpho\n");
    readPho(phofile);     
    printf("create unisyn index\n");
    voiceindex = new UnisynIndex();
    voiceindex->readFromFile(voicedir+indexfile);
    printf("create synth\n");
    
    MBRConfig config;
    config.type=synthType::FVOX;
    config.mbr_period=0;
    config.frame_period=0;
    config.fft_size=0;
  
    MBRSynth *synth = new MBRSynth(ctrl);
    synth->setBasedir(voicedir);
    synth->setConfig(&config);
    
    //TODO: load ogg files / shared for UTAU and Unisyn
    printf("generate\n");
    generate(synth);
    printf("run\n");


  SF_INFO info = {0};
  info.samplerate = 32000;
  info.channels = 1;
  info.format = SF_FORMAT_WAV | SF_FORMAT_PCM_16;
  SNDFILE *sf = sf_open(wavfile.c_str(), SFM_WRITE, &info);

  while (1) {
    const int size = 1024;
    int fill = size * 4;

    float buffer_out[size];
    if (synth->readData(buffer_out, size, fill) == false) break;
    sf_write_float(sf, buffer_out, size);
  }

  sf_close(sf);

    
}