12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491 |
- /*
- ** 2014 May 31
- **
- ** The author disclaims copyright to this source code. In place of
- ** a legal notice, here is a blessing:
- **
- ** May you do good and not evil.
- ** May you find forgiveness for yourself and forgive others.
- ** May you share freely, never taking more than you give.
- **
- ******************************************************************************
- */
- #include "fts5Int.h"
- /**************************************************************************
- ** Start of ascii tokenizer implementation.
- */
/*
** Default token-character map for tokenizers with no "unicode" modifier.
** Entry i is 1 if ASCII codepoint i is a token character and 0 if it is a
** separator. By default the token characters are exactly the ASCII range
** alphanumerics ('0'..'9', 'A'..'Z' and 'a'..'z').
**
** The table is a read-only template: fts5AsciiCreate() copies it into each
** new tokenizer instance before applying any "tokenchars"/"separators"
** options, so it is declared const.
*/
static const unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
/*
** State of one "ascii" tokenizer instance: a private copy of the
** token-character map, indexed by ASCII codepoint. A non-zero entry means
** the codepoint is a token character; zero means it is a separator.
*/
typedef struct AsciiTokenizer {
  unsigned char aTokenChar[128];
} AsciiTokenizer;
- static void fts5AsciiAddExceptions(
- AsciiTokenizer *p,
- const char *zArg,
- int bTokenChars
- ){
- int i;
- for(i=0; zArg[i]; i++){
- if( (zArg[i] & 0x80)==0 ){
- p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
- }
- }
- }
- /*
- ** Delete a "ascii" tokenizer.
- */
- static void fts5AsciiDelete(Fts5Tokenizer *p){
- sqlite3_free(p);
- }
- /*
- ** Create an "ascii" tokenizer.
- */
- static int fts5AsciiCreate(
- void *pUnused,
- const char **azArg, int nArg,
- Fts5Tokenizer **ppOut
- ){
- int rc = SQLITE_OK;
- AsciiTokenizer *p = 0;
- UNUSED_PARAM(pUnused);
- if( nArg%2 ){
- rc = SQLITE_ERROR;
- }else{
- p = sqlite3_malloc(sizeof(AsciiTokenizer));
- if( p==0 ){
- rc = SQLITE_NOMEM;
- }else{
- int i;
- memset(p, 0, sizeof(AsciiTokenizer));
- memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
- for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
- const char *zArg = azArg[i+1];
- if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
- fts5AsciiAddExceptions(p, zArg, 1);
- }else
- if( 0==sqlite3_stricmp(azArg[i], "separators") ){
- fts5AsciiAddExceptions(p, zArg, 0);
- }else{
- rc = SQLITE_ERROR;
- }
- }
- if( rc!=SQLITE_OK ){
- fts5AsciiDelete((Fts5Tokenizer*)p);
- p = 0;
- }
- }
- }
- *ppOut = (Fts5Tokenizer*)p;
- return rc;
- }
/*
** Copy nByte bytes from aIn to aOut, converting each ASCII upper-case
** letter ('A'..'Z') to lower case. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int iByte = 0;
  while( iByte<nByte ){
    char ch = aIn[iByte];
    if( ch>='A' && ch<='Z' ){
      ch = (char)(ch + ('a' - 'A'));
    }
    aOut[iByte++] = ch;
  }
}
- /*
- ** Tokenize some text using the ascii tokenizer.
- */
- static int fts5AsciiTokenize(
- Fts5Tokenizer *pTokenizer,
- void *pCtx,
- int iUnused,
- const char *pText, int nText,
- int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
- ){
- AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
- int rc = SQLITE_OK;
- int ie;
- int is = 0;
- char aFold[64];
- int nFold = sizeof(aFold);
- char *pFold = aFold;
- unsigned char *a = p->aTokenChar;
- UNUSED_PARAM(iUnused);
- while( is<nText && rc==SQLITE_OK ){
- int nByte;
- /* Skip any leading divider characters. */
- while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
- is++;
- }
- if( is==nText ) break;
- /* Count the token characters */
- ie = is+1;
- while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
- ie++;
- }
- /* Fold to lower case */
- nByte = ie-is;
- if( nByte>nFold ){
- if( pFold!=aFold ) sqlite3_free(pFold);
- pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
- if( pFold==0 ){
- rc = SQLITE_NOMEM;
- break;
- }
- nFold = nByte*2;
- }
- asciiFold(pFold, &pText[is], nByte);
- /* Invoke the token callback */
- rc = xToken(pCtx, 0, pFold, nByte, is, ie);
- is = ie+1;
- }
-
- if( pFold!=aFold ) sqlite3_free(pFold);
- if( rc==SQLITE_DONE ) rc = SQLITE_OK;
- return rc;
- }
- /**************************************************************************
- ** Start of unicode61 tokenizer implementation.
- */
/*
** READ_UTF8 and WRITE_UTF8 below are copies of the macros of the same name
** in the sqlite3 source file utf.c. When this file is compiled as part of
** the amalgamation they are not required, so the copies are compiled out.
*/
#ifndef SQLITE_AMALGAMATION

/*
** Lookup table used by READ_UTF8. It is indexed by (lead byte - 0xc0) and
** gives the initial value of the codepoint accumulator for a multi-byte
** sequence that starts with that lead byte.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 0xc0..0xcf */
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 0xd0..0xdf */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 0xe0..0xef */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,   /* 0xf0..0xff */
};

/*
** Read one codepoint from the byte pointer zIn into c, advancing zIn. No
** continuation byte at or beyond zTerm is consumed. If the decoded value
** is below 0x80, lies in the surrogate range 0xD800..0xDFFF, or is 0xFFFE
** or 0xFFFF, c is set to 0xFFFD instead.
**
** The expansion is a bare statement sequence (it is not wrapped in
** braces), so the macro must be used as a statement of its own.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn<zTerm && (*zIn & 0xc0)==0x80 ){             \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }         \
  }

/*
** Append the UTF-8 encoding of codepoint c (1 to 4 bytes) at zOut,
** advancing zOut past the bytes written.
*/
#define WRITE_UTF8(zOut, c) {                              \
  if( c<0x00080 ){                                         \
    *zOut++ = (unsigned char)(c&0xFF);                     \
  }                                                        \
  else if( c<0x00800 ){                                    \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);         \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);            \
  }                                                        \
  else if( c<0x10000 ){                                    \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);        \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);       \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);            \
  }else{                                                   \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);      \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);      \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);       \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);            \
  }                                                        \
}

#endif /* ifndef SQLITE_AMALGAMATION */
/*
** Advance zIn past one UTF-8 encoded character: consume one byte and, if
** that byte is 0xc0 or greater, also consume every continuation byte
** (10xxxxxx) that follows it. No end-of-buffer pointer is consulted, so
** the scan stops only at the first non-continuation byte.
*/
#define FTS5_SKIP_UTF8(zIn) {                                \
  if( (unsigned char)(*(zIn++)) >= 0xc0 ){                   \
    while( (((unsigned char)*zIn) & 0xc0)==0x80 ){ zIn++; }  \
  }                                                          \
}
/*
** State of one "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* An FTS5_REMOVE_DIACRITICS_* value */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception codepoints */
  unsigned char aCategory[32];    /* True for token char categories */
} Unicode61Tokenizer;
/*
** Legal values for Unicode61Tokenizer.eRemoveDiacritic. They correspond to
** the digit given to the "remove_diacritics" tokenizer option and must
** match the internals of fts5_unicode2.c.
*/
#define FTS5_REMOVE_DIACRITICS_NONE     0
#define FTS5_REMOVE_DIACRITICS_SIMPLE   1
#define FTS5_REMOVE_DIACRITICS_COMPLEX  2
- static int fts5UnicodeAddExceptions(
- Unicode61Tokenizer *p, /* Tokenizer object */
- const char *z, /* Characters to treat as exceptions */
- int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */
- ){
- int rc = SQLITE_OK;
- int n = (int)strlen(z);
- int *aNew;
- if( n>0 ){
- aNew = (int*)sqlite3_realloc64(p->aiException,
- (n+p->nException)*sizeof(int));
- if( aNew ){
- int nNew = p->nException;
- const unsigned char *zCsr = (const unsigned char*)z;
- const unsigned char *zTerm = (const unsigned char*)&z[n];
- while( zCsr<zTerm ){
- u32 iCode;
- int bToken;
- READ_UTF8(zCsr, zTerm, iCode);
- if( iCode<128 ){
- p->aTokenChar[iCode] = (unsigned char)bTokenChars;
- }else{
- bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
- assert( (bToken==0 || bToken==1) );
- assert( (bTokenChars==0 || bTokenChars==1) );
- if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
- int i;
- for(i=0; i<nNew; i++){
- if( (u32)aNew[i]>iCode ) break;
- }
- memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
- aNew[i] = iCode;
- nNew++;
- }
- }
- }
- p->aiException = aNew;
- p->nException = nNew;
- }else{
- rc = SQLITE_NOMEM;
- }
- }
- return rc;
- }
- /*
- ** Return true if the p->aiException[] array contains the value iCode.
- */
- static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
- if( p->nException>0 ){
- int *a = p->aiException;
- int iLo = 0;
- int iHi = p->nException-1;
- while( iHi>=iLo ){
- int iTest = (iHi + iLo) / 2;
- if( iCode==a[iTest] ){
- return 1;
- }else if( iCode>a[iTest] ){
- iLo = iTest+1;
- }else{
- iHi = iTest-1;
- }
- }
- }
- return 0;
- }
- /*
- ** Delete a "unicode61" tokenizer.
- */
- static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
- if( pTok ){
- Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
- sqlite3_free(p->aiException);
- sqlite3_free(p->aFold);
- sqlite3_free(p);
- }
- return;
- }
- static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
- const char *z = zCat;
- while( *z ){
- while( *z==' ' || *z=='\t' ) z++;
- if( *z && sqlite3Fts5UnicodeCatParse(z, p->aCategory) ){
- return SQLITE_ERROR;
- }
- while( *z!=' ' && *z!='\t' && *z!='\0' ) z++;
- }
- sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
- return SQLITE_OK;
- }
- /*
- ** Create a "unicode61" tokenizer.
- */
- static int fts5UnicodeCreate(
- void *pUnused,
- const char **azArg, int nArg,
- Fts5Tokenizer **ppOut
- ){
- int rc = SQLITE_OK; /* Return code */
- Unicode61Tokenizer *p = 0; /* New tokenizer object */
- UNUSED_PARAM(pUnused);
- if( nArg%2 ){
- rc = SQLITE_ERROR;
- }else{
- p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
- if( p ){
- const char *zCat = "L* N* Co";
- int i;
- memset(p, 0, sizeof(Unicode61Tokenizer));
- p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
- p->nFold = 64;
- p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
- if( p->aFold==0 ){
- rc = SQLITE_NOMEM;
- }
- /* Search for a "categories" argument */
- for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
- if( 0==sqlite3_stricmp(azArg[i], "categories") ){
- zCat = azArg[i+1];
- }
- }
- if( rc==SQLITE_OK ){
- rc = unicodeSetCategories(p, zCat);
- }
- for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
- const char *zArg = azArg[i+1];
- if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
- if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
- rc = SQLITE_ERROR;
- }else{
- p->eRemoveDiacritic = (zArg[0] - '0');
- assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
- || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
- || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
- );
- }
- }else
- if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
- rc = fts5UnicodeAddExceptions(p, zArg, 1);
- }else
- if( 0==sqlite3_stricmp(azArg[i], "separators") ){
- rc = fts5UnicodeAddExceptions(p, zArg, 0);
- }else
- if( 0==sqlite3_stricmp(azArg[i], "categories") ){
- /* no-op */
- }else{
- rc = SQLITE_ERROR;
- }
- }
- }else{
- rc = SQLITE_NOMEM;
- }
- if( rc!=SQLITE_OK ){
- fts5UnicodeDelete((Fts5Tokenizer*)p);
- p = 0;
- }
- *ppOut = (Fts5Tokenizer*)p;
- }
- return rc;
- }
- /*
- ** Return true if, for the purposes of tokenizing with the tokenizer
- ** passed as the first argument, codepoint iCode is considered a token
- ** character (not a separator).
- */
- static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
- return (
- p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
- ^ fts5UnicodeIsException(p, iCode)
- );
- }
- static int fts5UnicodeTokenize(
- Fts5Tokenizer *pTokenizer,
- void *pCtx,
- int iUnused,
- const char *pText, int nText,
- int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
- ){
- Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
- int rc = SQLITE_OK;
- unsigned char *a = p->aTokenChar;
- unsigned char *zTerm = (unsigned char*)&pText[nText];
- unsigned char *zCsr = (unsigned char *)pText;
- /* Output buffer */
- char *aFold = p->aFold;
- int nFold = p->nFold;
- const char *pEnd = &aFold[nFold-6];
- UNUSED_PARAM(iUnused);
- /* Each iteration of this loop gobbles up a contiguous run of separators,
- ** then the next token. */
- while( rc==SQLITE_OK ){
- u32 iCode; /* non-ASCII codepoint read from input */
- char *zOut = aFold;
- int is;
- int ie;
- /* Skip any separator characters. */
- while( 1 ){
- if( zCsr>=zTerm ) goto tokenize_done;
- if( *zCsr & 0x80 ) {
- /* A character outside of the ascii range. Skip past it if it is
- ** a separator character. Or break out of the loop if it is not. */
- is = zCsr - (unsigned char*)pText;
- READ_UTF8(zCsr, zTerm, iCode);
- if( fts5UnicodeIsAlnum(p, iCode) ){
- goto non_ascii_tokenchar;
- }
- }else{
- if( a[*zCsr] ){
- is = zCsr - (unsigned char*)pText;
- goto ascii_tokenchar;
- }
- zCsr++;
- }
- }
- /* Run through the tokenchars. Fold them into the output buffer along
- ** the way. */
- while( zCsr<zTerm ){
- /* Grow the output buffer so that there is sufficient space to fit the
- ** largest possible utf-8 character. */
- if( zOut>pEnd ){
- aFold = sqlite3_malloc64((sqlite3_int64)nFold*2);
- if( aFold==0 ){
- rc = SQLITE_NOMEM;
- goto tokenize_done;
- }
- zOut = &aFold[zOut - p->aFold];
- memcpy(aFold, p->aFold, nFold);
- sqlite3_free(p->aFold);
- p->aFold = aFold;
- p->nFold = nFold = nFold*2;
- pEnd = &aFold[nFold-6];
- }
- if( *zCsr & 0x80 ){
- /* An non-ascii-range character. Fold it into the output buffer if
- ** it is a token character, or break out of the loop if it is not. */
- READ_UTF8(zCsr, zTerm, iCode);
- if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
- non_ascii_tokenchar:
- iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
- if( iCode ) WRITE_UTF8(zOut, iCode);
- }else{
- break;
- }
- }else if( a[*zCsr]==0 ){
- /* An ascii-range separator character. End of token. */
- break;
- }else{
- ascii_tokenchar:
- if( *zCsr>='A' && *zCsr<='Z' ){
- *zOut++ = *zCsr + 32;
- }else{
- *zOut++ = *zCsr;
- }
- zCsr++;
- }
- ie = zCsr - (unsigned char*)pText;
- }
- /* Invoke the token callback */
- rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
- }
-
- tokenize_done:
- if( rc==SQLITE_DONE ) rc = SQLITE_OK;
- return rc;
- }
- /**************************************************************************
- ** Start of porter stemmer implementation.
- */
/*
** Tokens longer than this many bytes are not stemmed. They are passed
** through the porter tokenizer unchanged.
*/
#define FTS5_PORTER_MAX_TOKEN  64
- typedef struct PorterTokenizer PorterTokenizer;
- struct PorterTokenizer {
- fts5_tokenizer_v2 tokenizer_v2; /* Parent tokenizer module */
- Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
- char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
- };
- /*
- ** Delete a "porter" tokenizer.
- */
- static void fts5PorterDelete(Fts5Tokenizer *pTok){
- if( pTok ){
- PorterTokenizer *p = (PorterTokenizer*)pTok;
- if( p->pTokenizer ){
- p->tokenizer_v2.xDelete(p->pTokenizer);
- }
- sqlite3_free(p);
- }
- }
- /*
- ** Create a "porter" tokenizer.
- */
- static int fts5PorterCreate(
- void *pCtx,
- const char **azArg, int nArg,
- Fts5Tokenizer **ppOut
- ){
- fts5_api *pApi = (fts5_api*)pCtx;
- int rc = SQLITE_OK;
- PorterTokenizer *pRet;
- void *pUserdata = 0;
- const char *zBase = "unicode61";
- fts5_tokenizer_v2 *pV2 = 0;
- if( nArg>0 ){
- zBase = azArg[0];
- }
- pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
- if( pRet ){
- memset(pRet, 0, sizeof(PorterTokenizer));
- rc = pApi->xFindTokenizer_v2(pApi, zBase, &pUserdata, &pV2);
- }else{
- rc = SQLITE_NOMEM;
- }
- if( rc==SQLITE_OK ){
- int nArg2 = (nArg>0 ? nArg-1 : 0);
- const char **az2 = (nArg2 ? &azArg[1] : 0);
- memcpy(&pRet->tokenizer_v2, pV2, sizeof(fts5_tokenizer_v2));
- rc = pRet->tokenizer_v2.xCreate(pUserdata, az2, nArg2, &pRet->pTokenizer);
- }
- if( rc!=SQLITE_OK ){
- fts5PorterDelete((Fts5Tokenizer*)pRet);
- pRet = 0;
- }
- *ppOut = (Fts5Tokenizer*)pRet;
- return rc;
- }
/*
** Context for the porter token callback: the wrapped callback, the context
** pointer to pass to it, and a buffer pointer.
*/
typedef struct PorterContext {
  void *pCtx;                                             /* Arg for xToken */
  int (*xToken)(void*, int, const char*, int, int, int);  /* Wrapped callback */
  char *aBuf;                                             /* Stemming buffer */
} PorterContext;
/*
** One suffix-rewriting rule: if the word ends in zSuffix and xCond (when
** not NULL) accepts the remaining stem, the suffix is replaced by zOutput.
*/
typedef struct PorterRule {
  const char *zSuffix;                  /* Suffix to match */
  int nSuffix;                          /* Length of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem); /* Condition on the stem, or NULL */
  const char *zOutput;                  /* Replacement text */
  int nOutput;                          /* Length of zOutput in bytes */
} PorterRule;
#if 0
/*
** DISABLED: table-driven rule application, compiled out by the "#if 0".
**
** Find the first rule in aRule[] (terminated by an entry with a NULL
** zSuffix) whose suffix matches the end of aBuf[0..*pnBuf). If its
** condition holds, rewrite the suffix in place, update *pnBuf and return
** the index of the rule. Otherwise return -1.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int nBuf = *pnBuf;
  PorterRule *pRule;
  int nStem;

  for(pRule=aRule; pRule->zSuffix; pRule++){
    assert( strlen(pRule->zSuffix)==pRule->nSuffix );
    assert( strlen(pRule->zOutput)==pRule->nOutput );
    if( nBuf>=pRule->nSuffix
     && 0==memcmp(&aBuf[nBuf - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
    ){
      break;
    }
  }
  if( pRule->zSuffix==0 ) return -1;

  nStem = nBuf - pRule->nSuffix;
  if( pRule->xCond!=0 && pRule->xCond(aBuf, nStem)==0 ) return -1;

  memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
  *pnBuf = nStem + pRule->nOutput;
  return pRule - aRule;
}
#endif
/*
** Return 1 if c is a lower-case vowel and 0 otherwise. 'a', 'e', 'i', 'o'
** and 'u' are always vowels; 'y' counts as a vowel only if bYIsVowel is
** non-zero.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel ? 1 : 0;
    default:
      return 0;
  }
}
/*
** Scan zStem[0..nStem) for a vowel followed (not necessarily immediately)
** by a consonant. Return the offset one past that consonant, or 0 if no
** such pair exists.
**
** bPrevCons is the initial "previous character was a consonant" state. It
** matters because 'y' is treated as a vowel only when the character
** before it is a consonant.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int iPos = 0;
  int bCons = bPrevCons;

  /* Scan for a vowel */
  while( iPos<nStem ){
    bCons = !fts5PorterIsVowel(zStem[iPos], bCons);
    if( bCons==0 ) break;
    iPos++;
  }

  /* Scan for a consonant, starting after the vowel */
  iPos++;
  while( iPos<nStem ){
    bCons = !fts5PorterIsVowel(zStem[iPos], bCons);
    if( bCons ) return iPos+1;
    iPos++;
  }
  return 0;
}
/*
** porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel-consonant sequence.
*/
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
/*
** porter rule condition: (m > 1)
**
** True if the stem contains at least two vowel-consonant sequences.
*/
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  /* The first sequence ended on a consonant, hence bPrevCons==1. */
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
/*
** porter rule condition: (m = 1)
**
** True if the stem contains exactly one vowel-consonant sequence.
*/
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  /* The first sequence ended on a consonant, hence bPrevCons==1. */
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
/*
** porter rule condition: (*o)
**
** True if the stem ends consonant-vowel-consonant and the final consonant
** is not 'w', 'x' or 'y'.
**
** The consonant/vowel classification of each character depends on the one
** before it ('y' handling), so the whole stem is scanned from the start.
** Only the classification of the last three characters matters, so the
** history mask is truncated to 3 bits on every step. This keeps the left
** shift well inside the range of int however long the stem is. Shifting
** an unmasked signed int would overflow, which is undefined behaviour, for
** stems longer than about 31 characters.
**
** An empty stem never satisfies the condition and is rejected up front,
** so that zStem[nStem-1] is never read out of bounds.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( nStem<1 ) return 0;
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    int mask = 0;                 /* Bit set => consonant; low 3 bits kept */
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = ((mask << 1) | bCons) & 0x0007;
    }
    return (mask==0x0005);
  }
}
/*
** porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and contains at least two
** vowel-consonant sequences. The stem must not be empty.
*/
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
/*
** porter rule condition: (*v*)
**
** True if the stem contains a vowel. 'y' is accepted as a vowel anywhere
** except in the first position.
*/
static int fts5Porter_Vowel(char *zStem, int nStem){
  int iPos = 0;
  while( iPos<nStem ){
    if( fts5PorterIsVowel(zStem[iPos], iPos>0) ) return 1;
    iPos++;
  }
  return 0;
}
- /**************************************************************************
- ***************************************************************************
- ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
- */
/*
** Porter stemmer step 4: if the word in aBuf[0..*pnBuf) ends with one of
** the suffixes below and the remaining stem satisfies (m > 1), remove the
** suffix by shrinking *pnBuf. For "ion" the stem must also end in 's' or
** 't'. The first suffix that matches is the only one considered. Always
** returns 0.
**
** The rules are dispatched on the second-to-last character. Every rule
** needs more than two characters, so buffers shorter than two bytes are
** rejected up front rather than reading aBuf[nBuf-2] out of bounds.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
-
/*
** Porter stemmer step 1b, second part: if the word in aBuf[0..*pnBuf)
** ends in "at", "bl" or "iz" (and is longer than that suffix), append an
** 'e' and return 1. Otherwise leave the word unchanged and return 0.
**
** The caller must ensure that aBuf has room for one more byte.
**
** The rules are dispatched on the second-to-last character. A word of
** fewer than two bytes cannot match any rule and is rejected up front
** rather than reading aBuf[nBuf-2] out of bounds. One-byte words do occur:
** step 1b reduces the token "aed" to "a".
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
-
/*
** Porter stemmer step 2: if the word in aBuf[0..*pnBuf) ends with one of
** the suffixes below and the remaining stem satisfies (m > 0), replace the
** suffix with its shorter form and update *pnBuf. The first suffix that
** matches is the only one considered. Always returns 0.
**
** The rules are dispatched on the second-to-last character. Every rule
** needs more than two characters, so buffers shorter than two bytes are
** rejected up front rather than reading aBuf[nBuf-2] out of bounds.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ate", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "tion", 4);
          *pnBuf = nBuf - 6 + 4;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ence", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ance", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }
      break;

    case 'e':
      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ize", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'g':
      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "log", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'l':
      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ble", 3);
          *pnBuf = nBuf - 3 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "al", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ent", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "e", 1);
          *pnBuf = nBuf - 3 + 1;
        }
      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ous", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }
      break;

    case 'o':
      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ize", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ate", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ate", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 's':
      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ive", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ful", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ous", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ive", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "ble", 3);
          *pnBuf = nBuf - 6 + 3;
        }
      }
      break;

  }
  return ret;
}
-
/*
** Porter stemmer step 3: if the word in aBuf[0..*pnBuf) ends with one of
** the suffixes below and the remaining stem satisfies (m > 0), replace or
** remove the suffix and update *pnBuf. The first suffix that matches is
** the only one considered. Always returns 0.
**
** The rules are dispatched on the second-to-last character. Every rule
** needs more than two characters, so buffers shorter than two bytes are
** rejected up front rather than reading aBuf[nBuf-2] out of bounds.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
-
/*
** Porter stemmer step 1b, first part, applied to the word in
** aBuf[0..*pnBuf):
**
**   (m > 0) "eed" -> "ee"      returns 0
**   (*v*)   "ed"  -> ""        returns 1
**   (*v*)   "ing" -> ""        returns 1
**
** If "eed" matches, "ed" is not tried even when the (m > 0) condition
** fails. A return value of 1 tells the caller to apply the step 1b
** follow-up rules. Returns 0 if nothing was removed.
**
** The rules are dispatched on the second-to-last character. Every rule
** needs more than two characters, so buffers shorter than two bytes are
** rejected up front rather than reading aBuf[nBuf-2] out of bounds.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'e':
      if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ee", 2);
          *pnBuf = nBuf - 3 + 2;
        }
      }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_Vowel(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
          ret = 1;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_Vowel(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
          ret = 1;
        }
      }
      break;

  }
  return ret;
}
-
- /*
- ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
- ***************************************************************************
- **************************************************************************/
/*
** Porter stemmer step 1A: plural handling. If the nBuf byte buffer aBuf[]
** ends in 's', shorten it as follows by updating *pnBuf (the bytes of
** aBuf[] themselves are never modified):
**
**   "...sses" (more than 4 bytes)  -> drop 2 bytes
**   "...ies"  (more than 3 bytes)  -> drop 2 bytes
**   "...es"   (anything else)      -> drop 1 byte
**   "...ss"                        -> unchanged
**   "...s"    (anything else)      -> drop 1 byte
**
** The caller must supply a buffer of at least two bytes.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int n = *pnBuf;
  int nDrop = 1;

  if( aBuf[n-1]!='s' ) return;          /* Not a plural candidate */
  if( aBuf[n-2]=='s' ) return;          /* "ss" is left alone */

  if( aBuf[n-2]=='e' ){
    int bSses = (n>4 && aBuf[n-3]=='s' && aBuf[n-4]=='s');
    int bIes = (n>3 && aBuf[n-3]=='i');
    if( bSses || bIes ) nDrop = 2;
  }
  *pnBuf = n - nDrop;
}
- static int fts5PorterCb(
- void *pCtx,
- int tflags,
- const char *pToken,
- int nToken,
- int iStart,
- int iEnd
- ){
- PorterContext *p = (PorterContext*)pCtx;
- char *aBuf;
- int nBuf;
- if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
- aBuf = p->aBuf;
- nBuf = nToken;
- memcpy(aBuf, pToken, nBuf);
- /* Step 1. */
- fts5PorterStep1A(aBuf, &nBuf);
- if( fts5PorterStep1B(aBuf, &nBuf) ){
- if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
- char c = aBuf[nBuf-1];
- if( fts5PorterIsVowel(c, 0)==0
- && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
- ){
- nBuf--;
- }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
- aBuf[nBuf++] = 'e';
- }
- }
- }
- /* Step 1C. */
- if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
- aBuf[nBuf-1] = 'i';
- }
- /* Steps 2 through 4. */
- fts5PorterStep2(aBuf, &nBuf);
- fts5PorterStep3(aBuf, &nBuf);
- fts5PorterStep4(aBuf, &nBuf);
- /* Step 5a. */
- assert( nBuf>0 );
- if( aBuf[nBuf-1]=='e' ){
- if( fts5Porter_MGt1(aBuf, nBuf-1)
- || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
- ){
- nBuf--;
- }
- }
- /* Step 5b. */
- if( nBuf>1 && aBuf[nBuf-1]=='l'
- && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
- ){
- nBuf--;
- }
- return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
- pass_through:
- return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
- }
- /*
- ** Tokenize using the porter tokenizer.
- */
- static int fts5PorterTokenize(
- Fts5Tokenizer *pTokenizer,
- void *pCtx,
- int flags,
- const char *pText, int nText,
- const char *pLoc, int nLoc,
- int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
- ){
- PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
- PorterContext sCtx;
- sCtx.xToken = xToken;
- sCtx.pCtx = pCtx;
- sCtx.aBuf = p->aBuf;
- return p->tokenizer_v2.xTokenize(
- p->pTokenizer, (void*)&sCtx, flags, pText, nText, pLoc, nLoc, fts5PorterCb
- );
- }
/**************************************************************************
** Start of trigram implementation.
*/

/*
** State of one trigram tokenizer instance, as configured by
** fts5TriCreate().
*/
typedef struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
  int iFoldParam;                 /* Parameter to pass to Fts5UnicodeFold() */
} TrigramTokenizer;
- /*
- ** Free a trigram tokenizer.
- */
- static void fts5TriDelete(Fts5Tokenizer *p){
- sqlite3_free(p);
- }
- /*
- ** Allocate a trigram tokenizer.
- */
- static int fts5TriCreate(
- void *pUnused,
- const char **azArg,
- int nArg,
- Fts5Tokenizer **ppOut
- ){
- int rc = SQLITE_OK;
- TrigramTokenizer *pNew = 0;
- UNUSED_PARAM(pUnused);
- if( nArg%2 ){
- rc = SQLITE_ERROR;
- }else{
- int i;
- pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
- if( pNew==0 ){
- rc = SQLITE_NOMEM;
- }else{
- pNew->bFold = 1;
- pNew->iFoldParam = 0;
-
- for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
- const char *zArg = azArg[i+1];
- if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
- if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
- rc = SQLITE_ERROR;
- }else{
- pNew->bFold = (zArg[0]=='0');
- }
- }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
- if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
- rc = SQLITE_ERROR;
- }else{
- pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0;
- }
- }else{
- rc = SQLITE_ERROR;
- }
- }
-
- if( pNew->iFoldParam!=0 && pNew->bFold==0 ){
- rc = SQLITE_ERROR;
- }
-
- if( rc!=SQLITE_OK ){
- fts5TriDelete((Fts5Tokenizer*)pNew);
- pNew = 0;
- }
- }
- }
- *ppOut = (Fts5Tokenizer*)pNew;
- return rc;
- }
- /*
- ** Trigram tokenizer tokenize routine.
- */
- static int fts5TriTokenize(
- Fts5Tokenizer *pTok,
- void *pCtx,
- int unusedFlags,
- const char *pText, int nText,
- int (*xToken)(void*, int, const char*, int, int, int)
- ){
- TrigramTokenizer *p = (TrigramTokenizer*)pTok;
- int rc = SQLITE_OK;
- char aBuf[32];
- char *zOut = aBuf;
- int ii;
- const unsigned char *zIn = (const unsigned char*)pText;
- const unsigned char *zEof = (zIn ? &zIn[nText] : 0);
- u32 iCode = 0;
- int aStart[3]; /* Input offset of each character in aBuf[] */
- UNUSED_PARAM(unusedFlags);
- /* Populate aBuf[] with the characters for the first trigram. */
- for(ii=0; ii<3; ii++){
- do {
- aStart[ii] = zIn - (const unsigned char*)pText;
- if( zIn>=zEof ) return SQLITE_OK;
- READ_UTF8(zIn, zEof, iCode);
- if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
- }while( iCode==0 );
- WRITE_UTF8(zOut, iCode);
- }
- /* At the start of each iteration of this loop:
- **
- ** aBuf: Contains 3 characters. The 3 characters of the next trigram.
- ** zOut: Points to the byte following the last character in aBuf.
- ** aStart[3]: Contains the byte offset in the input text corresponding
- ** to the start of each of the three characters in the buffer.
- */
- assert( zIn<=zEof );
- while( 1 ){
- int iNext; /* Start of character following current tri */
- const char *z1;
- /* Read characters from the input up until the first non-diacritic */
- do {
- iNext = zIn - (const unsigned char*)pText;
- if( zIn>=zEof ){
- iCode = 0;
- break;
- }
- READ_UTF8(zIn, zEof, iCode);
- if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
- }while( iCode==0 );
- /* Pass the current trigram back to fts5 */
- rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext);
- if( iCode==0 || rc!=SQLITE_OK ) break;
- /* Remove the first character from buffer aBuf[]. Append the character
- ** with codepoint iCode. */
- z1 = aBuf;
- FTS5_SKIP_UTF8(z1);
- memmove(aBuf, z1, zOut - z1);
- zOut -= (z1 - aBuf);
- WRITE_UTF8(zOut, iCode);
- /* Update the aStart[] array */
- aStart[0] = aStart[1];
- aStart[1] = aStart[2];
- aStart[2] = iNext;
- }
- return rc;
- }
- /*
- ** Argument xCreate is a pointer to a constructor function for a tokenizer.
- ** pTok is a tokenizer previously created using the same method. This function
- ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
- ** indicating the style of pattern matching that the tokenizer can support.
- ** In practice, this is:
- **
- ** "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
- ** "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
- ** all other tokenizers - FTS5_PATTERN_NONE
- */
- int sqlite3Fts5TokenizerPattern(
- int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
- Fts5Tokenizer *pTok
- ){
- if( xCreate==fts5TriCreate ){
- TrigramTokenizer *p = (TrigramTokenizer*)pTok;
- if( p->iFoldParam==0 ){
- return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
- }
- }
- return FTS5_PATTERN_NONE;
- }
- /*
- ** Return true if the tokenizer described by p->azArg[] is the trigram
- ** tokenizer. This tokenizer needs to be loaded before xBestIndex is
- ** called for the first time in order to correctly handle LIKE/GLOB.
- */
- int sqlite3Fts5TokenizerPreload(Fts5TokenizerConfig *p){
- return (p->nArg>=1 && 0==sqlite3_stricmp(p->azArg[0], "trigram"));
- }
- /*
- ** Register all built-in tokenizers with FTS5.
- */
- int sqlite3Fts5TokenizerInit(fts5_api *pApi){
- struct BuiltinTokenizer {
- const char *zName;
- fts5_tokenizer x;
- } aBuiltin[] = {
- { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
- { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
- { "trigram", {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
- };
-
- int rc = SQLITE_OK; /* Return code */
- int i; /* To iterate through builtin functions */
- for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
- rc = pApi->xCreateTokenizer(pApi,
- aBuiltin[i].zName,
- (void*)pApi,
- &aBuiltin[i].x,
- 0
- );
- }
- if( rc==SQLITE_OK ){
- fts5_tokenizer_v2 sPorter = {
- 2,
- fts5PorterCreate,
- fts5PorterDelete,
- fts5PorterTokenize
- };
- rc = pApi->xCreateTokenizer_v2(pApi,
- "porter",
- (void*)pApi,
- &sPorter,
- 0
- );
- }
- return rc;
- }
|