fts5_tokenize.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491
  1. /*
  2. ** 2014 May 31
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. */
  13. #include "fts5Int.h"
  14. /**************************************************************************
  15. ** Start of ascii tokenizer implementation.
  16. */
  17. /*
  18. ** For tokenizers with no "unicode" modifier, the set of token characters
  19. ** is the same as the set of ASCII range alphanumeric characters.
  20. */
  /*
  ** Default classification of the 128 ASCII code points, indexed by
  ** character code. An entry is 1 for '0'..'9', 'A'..'Z' and 'a'..'z' and
  ** 0 for every other code.
  */
  static unsigned char aAsciiTokenChar[128] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1,      /* '0'..'7' */
    /* 0x38 */ 1, 1, 0, 0, 0, 0, 0, 0,      /* '8', '9' */
    /* 0x40 */ 0, 1, 1, 1, 1, 1, 1, 1,      /* 'A'..'G' */
    /* 0x48 */ 1, 1, 1, 1, 1, 1, 1, 1,      /* 'H'..'O' */
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1,      /* 'P'..'W' */
    /* 0x58 */ 1, 1, 1, 0, 0, 0, 0, 0,      /* 'X'..'Z' */
    /* 0x60 */ 0, 1, 1, 1, 1, 1, 1, 1,      /* 'a'..'g' */
    /* 0x68 */ 1, 1, 1, 1, 1, 1, 1, 1,      /* 'h'..'o' */
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1,      /* 'p'..'w' */
    /* 0x78 */ 1, 1, 1, 0, 0, 0, 0, 0       /* 'x'..'z' */
  };
  /*
  ** State of one "ascii" tokenizer instance: one flag per ASCII code point.
  ** A non-zero entry marks the code as a token character, zero marks it as
  ** a separator.
  */
  typedef struct AsciiTokenizer {
    unsigned char aTokenChar[128];
  } AsciiTokenizer;
  35. static void fts5AsciiAddExceptions(
  36. AsciiTokenizer *p,
  37. const char *zArg,
  38. int bTokenChars
  39. ){
  40. int i;
  41. for(i=0; zArg[i]; i++){
  42. if( (zArg[i] & 0x80)==0 ){
  43. p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
  44. }
  45. }
  46. }
  47. /*
  48. ** Delete a "ascii" tokenizer.
  49. */
  50. static void fts5AsciiDelete(Fts5Tokenizer *p){
  51. sqlite3_free(p);
  52. }
  53. /*
  54. ** Create an "ascii" tokenizer.
  55. */
  56. static int fts5AsciiCreate(
  57. void *pUnused,
  58. const char **azArg, int nArg,
  59. Fts5Tokenizer **ppOut
  60. ){
  61. int rc = SQLITE_OK;
  62. AsciiTokenizer *p = 0;
  63. UNUSED_PARAM(pUnused);
  64. if( nArg%2 ){
  65. rc = SQLITE_ERROR;
  66. }else{
  67. p = sqlite3_malloc(sizeof(AsciiTokenizer));
  68. if( p==0 ){
  69. rc = SQLITE_NOMEM;
  70. }else{
  71. int i;
  72. memset(p, 0, sizeof(AsciiTokenizer));
  73. memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
  74. for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
  75. const char *zArg = azArg[i+1];
  76. if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
  77. fts5AsciiAddExceptions(p, zArg, 1);
  78. }else
  79. if( 0==sqlite3_stricmp(azArg[i], "separators") ){
  80. fts5AsciiAddExceptions(p, zArg, 0);
  81. }else{
  82. rc = SQLITE_ERROR;
  83. }
  84. }
  85. if( rc!=SQLITE_OK ){
  86. fts5AsciiDelete((Fts5Tokenizer*)p);
  87. p = 0;
  88. }
  89. }
  90. }
  91. *ppOut = (Fts5Tokenizer*)p;
  92. return rc;
  93. }
  /*
  ** Copy nByte bytes from aIn[] to aOut[], replacing each upper-case ASCII
  ** letter ('A'..'Z') with its lower-case equivalent. All other bytes are
  ** copied unchanged. Nothing is written if nByte is zero or negative.
  */
  static void asciiFold(char *aOut, const char *aIn, int nByte){
    int iByte = 0;
    while( iByte<nByte ){
      const char c = aIn[iByte];
      aOut[iByte] = (c<'A' || c>'Z') ? c : (char)(c + 32);
      iByte++;
    }
  }
  102. /*
  103. ** Tokenize some text using the ascii tokenizer.
  104. */
  105. static int fts5AsciiTokenize(
  106. Fts5Tokenizer *pTokenizer,
  107. void *pCtx,
  108. int iUnused,
  109. const char *pText, int nText,
  110. int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
  111. ){
  112. AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
  113. int rc = SQLITE_OK;
  114. int ie;
  115. int is = 0;
  116. char aFold[64];
  117. int nFold = sizeof(aFold);
  118. char *pFold = aFold;
  119. unsigned char *a = p->aTokenChar;
  120. UNUSED_PARAM(iUnused);
  121. while( is<nText && rc==SQLITE_OK ){
  122. int nByte;
  123. /* Skip any leading divider characters. */
  124. while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
  125. is++;
  126. }
  127. if( is==nText ) break;
  128. /* Count the token characters */
  129. ie = is+1;
  130. while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
  131. ie++;
  132. }
  133. /* Fold to lower case */
  134. nByte = ie-is;
  135. if( nByte>nFold ){
  136. if( pFold!=aFold ) sqlite3_free(pFold);
  137. pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
  138. if( pFold==0 ){
  139. rc = SQLITE_NOMEM;
  140. break;
  141. }
  142. nFold = nByte*2;
  143. }
  144. asciiFold(pFold, &pText[is], nByte);
  145. /* Invoke the token callback */
  146. rc = xToken(pCtx, 0, pFold, nByte, is, ie);
  147. is = ie+1;
  148. }
  149. if( pFold!=aFold ) sqlite3_free(pFold);
  150. if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  151. return rc;
  152. }
  153. /**************************************************************************
  154. ** Start of unicode61 tokenizer implementation.
  155. */
  156. /*
  157. ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
  158. ** from the sqlite3 source file utf.c. If this file is compiled as part
  159. ** of the amalgamation, they are not required.
  160. */
  #ifndef SQLITE_AMALGAMATION
  /*
  ** Initial codepoint bits contributed by the first byte of a multi-byte
  ** UTF-8 sequence. The table is indexed by (first byte - 0xC0).
  */
  static const unsigned char sqlite3Utf8Trans1[] = {
    /* 0xC0..0xCF */
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 0xD0..0xDF */
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    /* 0xE0..0xEF */
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 0xF0..0xFF */
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
  };

  /*
  ** Read one UTF-8 character starting at zIn and store its codepoint in c,
  ** advancing zIn past the bytes consumed. Continuation bytes are consumed
  ** only while zIn<zTerm. If a multi-byte sequence decodes to a value less
  ** than 0x80, to a value in the range 0xD800..0xDFFF, or to 0xFFFE or
  ** 0xFFFF, c is set to 0xFFFD instead.
  **
  ** The expansion is more than one statement and evaluates its arguments
  ** several times, so it must be used as a stand-alone statement with
  ** simple lvalue arguments.
  */
  #define READ_UTF8(zIn, zTerm, c)                                   \
    c = *(zIn++);                                                    \
    if( c>=0xc0 ){                                                   \
      c = sqlite3Utf8Trans1[c-0xc0];                                 \
      while( zIn<zTerm && (*zIn & 0xc0)==0x80 ){                     \
        c = (c<<6) + (0x3f & *(zIn++));                              \
      }                                                              \
      if( c<0x80                                                     \
          || (c&0xFFFFF800)==0xD800                                  \
          || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }                \
    }

  /*
  ** Append the UTF-8 encoding of codepoint c (one to four bytes) at zOut,
  ** advancing zOut past the bytes written.
  */
  #define WRITE_UTF8(zOut, c) {                                      \
    if( c<0x00080 ){                                                 \
      *zOut++ = (unsigned char)(c&0xFF);                             \
    }                                                                \
    else if( c<0x00800 ){                                            \
      *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);                 \
      *zOut++ = 0x80 + (unsigned char)(c & 0x3F);                    \
    }                                                                \
    else if( c<0x10000 ){                                            \
      *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);                \
      *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);               \
      *zOut++ = 0x80 + (unsigned char)(c & 0x3F);                    \
    }else{                                                           \
      *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);              \
      *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);              \
      *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);               \
      *zOut++ = 0x80 + (unsigned char)(c & 0x3F);                    \
    }                                                                \
  }
  #endif /* ifndef SQLITE_AMALGAMATION */
  /*
  ** Advance pointer zIn past one UTF-8 character: always past the first
  ** byte and, if that byte is 0xC0 or greater, past every continuation byte
  ** (10xxxxxx) that follows it. No end-of-buffer pointer is consulted, so
  ** the scan relies on a non-continuation byte following the character.
  */
  #define FTS5_SKIP_UTF8(zIn) {                                      \
    if( ((unsigned char)(*(zIn++)))>=0xc0 ){                         \
      for(; (((unsigned char)*zIn) & 0xc0)==0x80; zIn++){}           \
    }                                                                \
  }
  /* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
  #define FTS5_REMOVE_DIACRITICS_NONE    0
  #define FTS5_REMOVE_DIACRITICS_SIMPLE  1
  #define FTS5_REMOVE_DIACRITICS_COMPLEX 2

  /*
  ** State of one "unicode61" tokenizer instance.
  */
  typedef struct Unicode61Tokenizer {
    unsigned char aTokenChar[128];  /* ASCII range token characters */
    char *aFold;                    /* Buffer to fold text into */
    int nFold;                      /* Size of aFold[] in bytes */
    int eRemoveDiacritic;           /* An FTS5_REMOVE_DIACRITICS_XXX value */
    int nException;                 /* Number of entries in aiException[] */
    int *aiException;               /* Exception codepoints, in sorted order */
    unsigned char aCategory[32];    /* True for token char categories */
  } Unicode61Tokenizer;
  222. static int fts5UnicodeAddExceptions(
  223. Unicode61Tokenizer *p, /* Tokenizer object */
  224. const char *z, /* Characters to treat as exceptions */
  225. int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */
  226. ){
  227. int rc = SQLITE_OK;
  228. int n = (int)strlen(z);
  229. int *aNew;
  230. if( n>0 ){
  231. aNew = (int*)sqlite3_realloc64(p->aiException,
  232. (n+p->nException)*sizeof(int));
  233. if( aNew ){
  234. int nNew = p->nException;
  235. const unsigned char *zCsr = (const unsigned char*)z;
  236. const unsigned char *zTerm = (const unsigned char*)&z[n];
  237. while( zCsr<zTerm ){
  238. u32 iCode;
  239. int bToken;
  240. READ_UTF8(zCsr, zTerm, iCode);
  241. if( iCode<128 ){
  242. p->aTokenChar[iCode] = (unsigned char)bTokenChars;
  243. }else{
  244. bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
  245. assert( (bToken==0 || bToken==1) );
  246. assert( (bTokenChars==0 || bTokenChars==1) );
  247. if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
  248. int i;
  249. for(i=0; i<nNew; i++){
  250. if( (u32)aNew[i]>iCode ) break;
  251. }
  252. memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
  253. aNew[i] = iCode;
  254. nNew++;
  255. }
  256. }
  257. }
  258. p->aiException = aNew;
  259. p->nException = nNew;
  260. }else{
  261. rc = SQLITE_NOMEM;
  262. }
  263. }
  264. return rc;
  265. }
  266. /*
  267. ** Return true if the p->aiException[] array contains the value iCode.
  268. */
  269. static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
  270. if( p->nException>0 ){
  271. int *a = p->aiException;
  272. int iLo = 0;
  273. int iHi = p->nException-1;
  274. while( iHi>=iLo ){
  275. int iTest = (iHi + iLo) / 2;
  276. if( iCode==a[iTest] ){
  277. return 1;
  278. }else if( iCode>a[iTest] ){
  279. iLo = iTest+1;
  280. }else{
  281. iHi = iTest-1;
  282. }
  283. }
  284. }
  285. return 0;
  286. }
  287. /*
  288. ** Delete a "unicode61" tokenizer.
  289. */
  290. static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
  291. if( pTok ){
  292. Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
  293. sqlite3_free(p->aiException);
  294. sqlite3_free(p->aFold);
  295. sqlite3_free(p);
  296. }
  297. return;
  298. }
  299. static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
  300. const char *z = zCat;
  301. while( *z ){
  302. while( *z==' ' || *z=='\t' ) z++;
  303. if( *z && sqlite3Fts5UnicodeCatParse(z, p->aCategory) ){
  304. return SQLITE_ERROR;
  305. }
  306. while( *z!=' ' && *z!='\t' && *z!='\0' ) z++;
  307. }
  308. sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
  309. return SQLITE_OK;
  310. }
  311. /*
  312. ** Create a "unicode61" tokenizer.
  313. */
  314. static int fts5UnicodeCreate(
  315. void *pUnused,
  316. const char **azArg, int nArg,
  317. Fts5Tokenizer **ppOut
  318. ){
  319. int rc = SQLITE_OK; /* Return code */
  320. Unicode61Tokenizer *p = 0; /* New tokenizer object */
  321. UNUSED_PARAM(pUnused);
  322. if( nArg%2 ){
  323. rc = SQLITE_ERROR;
  324. }else{
  325. p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
  326. if( p ){
  327. const char *zCat = "L* N* Co";
  328. int i;
  329. memset(p, 0, sizeof(Unicode61Tokenizer));
  330. p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
  331. p->nFold = 64;
  332. p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
  333. if( p->aFold==0 ){
  334. rc = SQLITE_NOMEM;
  335. }
  336. /* Search for a "categories" argument */
  337. for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
  338. if( 0==sqlite3_stricmp(azArg[i], "categories") ){
  339. zCat = azArg[i+1];
  340. }
  341. }
  342. if( rc==SQLITE_OK ){
  343. rc = unicodeSetCategories(p, zCat);
  344. }
  345. for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
  346. const char *zArg = azArg[i+1];
  347. if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
  348. if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
  349. rc = SQLITE_ERROR;
  350. }else{
  351. p->eRemoveDiacritic = (zArg[0] - '0');
  352. assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
  353. || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
  354. || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
  355. );
  356. }
  357. }else
  358. if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
  359. rc = fts5UnicodeAddExceptions(p, zArg, 1);
  360. }else
  361. if( 0==sqlite3_stricmp(azArg[i], "separators") ){
  362. rc = fts5UnicodeAddExceptions(p, zArg, 0);
  363. }else
  364. if( 0==sqlite3_stricmp(azArg[i], "categories") ){
  365. /* no-op */
  366. }else{
  367. rc = SQLITE_ERROR;
  368. }
  369. }
  370. }else{
  371. rc = SQLITE_NOMEM;
  372. }
  373. if( rc!=SQLITE_OK ){
  374. fts5UnicodeDelete((Fts5Tokenizer*)p);
  375. p = 0;
  376. }
  377. *ppOut = (Fts5Tokenizer*)p;
  378. }
  379. return rc;
  380. }
  381. /*
  382. ** Return true if, for the purposes of tokenizing with the tokenizer
  383. ** passed as the first argument, codepoint iCode is considered a token
  384. ** character (not a separator).
  385. */
  386. static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
  387. return (
  388. p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
  389. ^ fts5UnicodeIsException(p, iCode)
  390. );
  391. }
  392. static int fts5UnicodeTokenize(
  393. Fts5Tokenizer *pTokenizer,
  394. void *pCtx,
  395. int iUnused,
  396. const char *pText, int nText,
  397. int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
  398. ){
  399. Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  400. int rc = SQLITE_OK;
  401. unsigned char *a = p->aTokenChar;
  402. unsigned char *zTerm = (unsigned char*)&pText[nText];
  403. unsigned char *zCsr = (unsigned char *)pText;
  404. /* Output buffer */
  405. char *aFold = p->aFold;
  406. int nFold = p->nFold;
  407. const char *pEnd = &aFold[nFold-6];
  408. UNUSED_PARAM(iUnused);
  409. /* Each iteration of this loop gobbles up a contiguous run of separators,
  410. ** then the next token. */
  411. while( rc==SQLITE_OK ){
  412. u32 iCode; /* non-ASCII codepoint read from input */
  413. char *zOut = aFold;
  414. int is;
  415. int ie;
  416. /* Skip any separator characters. */
  417. while( 1 ){
  418. if( zCsr>=zTerm ) goto tokenize_done;
  419. if( *zCsr & 0x80 ) {
  420. /* A character outside of the ascii range. Skip past it if it is
  421. ** a separator character. Or break out of the loop if it is not. */
  422. is = zCsr - (unsigned char*)pText;
  423. READ_UTF8(zCsr, zTerm, iCode);
  424. if( fts5UnicodeIsAlnum(p, iCode) ){
  425. goto non_ascii_tokenchar;
  426. }
  427. }else{
  428. if( a[*zCsr] ){
  429. is = zCsr - (unsigned char*)pText;
  430. goto ascii_tokenchar;
  431. }
  432. zCsr++;
  433. }
  434. }
  435. /* Run through the tokenchars. Fold them into the output buffer along
  436. ** the way. */
  437. while( zCsr<zTerm ){
  438. /* Grow the output buffer so that there is sufficient space to fit the
  439. ** largest possible utf-8 character. */
  440. if( zOut>pEnd ){
  441. aFold = sqlite3_malloc64((sqlite3_int64)nFold*2);
  442. if( aFold==0 ){
  443. rc = SQLITE_NOMEM;
  444. goto tokenize_done;
  445. }
  446. zOut = &aFold[zOut - p->aFold];
  447. memcpy(aFold, p->aFold, nFold);
  448. sqlite3_free(p->aFold);
  449. p->aFold = aFold;
  450. p->nFold = nFold = nFold*2;
  451. pEnd = &aFold[nFold-6];
  452. }
  453. if( *zCsr & 0x80 ){
  454. /* An non-ascii-range character. Fold it into the output buffer if
  455. ** it is a token character, or break out of the loop if it is not. */
  456. READ_UTF8(zCsr, zTerm, iCode);
  457. if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
  458. non_ascii_tokenchar:
  459. iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
  460. if( iCode ) WRITE_UTF8(zOut, iCode);
  461. }else{
  462. break;
  463. }
  464. }else if( a[*zCsr]==0 ){
  465. /* An ascii-range separator character. End of token. */
  466. break;
  467. }else{
  468. ascii_tokenchar:
  469. if( *zCsr>='A' && *zCsr<='Z' ){
  470. *zOut++ = *zCsr + 32;
  471. }else{
  472. *zOut++ = *zCsr;
  473. }
  474. zCsr++;
  475. }
  476. ie = zCsr - (unsigned char*)pText;
  477. }
  478. /* Invoke the token callback */
  479. rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
  480. }
  481. tokenize_done:
  482. if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  483. return rc;
  484. }
  485. /**************************************************************************
  486. ** Start of porter stemmer implementation.
  487. */
  488. /* Any tokens larger than this (in bytes) are passed through without
  489. ** stemming. */
  490. #define FTS5_PORTER_MAX_TOKEN 64
  491. typedef struct PorterTokenizer PorterTokenizer;
  492. struct PorterTokenizer {
  493. fts5_tokenizer_v2 tokenizer_v2; /* Parent tokenizer module */
  494. Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
  495. char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
  496. };
  497. /*
  498. ** Delete a "porter" tokenizer.
  499. */
  500. static void fts5PorterDelete(Fts5Tokenizer *pTok){
  501. if( pTok ){
  502. PorterTokenizer *p = (PorterTokenizer*)pTok;
  503. if( p->pTokenizer ){
  504. p->tokenizer_v2.xDelete(p->pTokenizer);
  505. }
  506. sqlite3_free(p);
  507. }
  508. }
  509. /*
  510. ** Create a "porter" tokenizer.
  511. */
  512. static int fts5PorterCreate(
  513. void *pCtx,
  514. const char **azArg, int nArg,
  515. Fts5Tokenizer **ppOut
  516. ){
  517. fts5_api *pApi = (fts5_api*)pCtx;
  518. int rc = SQLITE_OK;
  519. PorterTokenizer *pRet;
  520. void *pUserdata = 0;
  521. const char *zBase = "unicode61";
  522. fts5_tokenizer_v2 *pV2 = 0;
  523. if( nArg>0 ){
  524. zBase = azArg[0];
  525. }
  526. pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
  527. if( pRet ){
  528. memset(pRet, 0, sizeof(PorterTokenizer));
  529. rc = pApi->xFindTokenizer_v2(pApi, zBase, &pUserdata, &pV2);
  530. }else{
  531. rc = SQLITE_NOMEM;
  532. }
  533. if( rc==SQLITE_OK ){
  534. int nArg2 = (nArg>0 ? nArg-1 : 0);
  535. const char **az2 = (nArg2 ? &azArg[1] : 0);
  536. memcpy(&pRet->tokenizer_v2, pV2, sizeof(fts5_tokenizer_v2));
  537. rc = pRet->tokenizer_v2.xCreate(pUserdata, az2, nArg2, &pRet->pTokenizer);
  538. }
  539. if( rc!=SQLITE_OK ){
  540. fts5PorterDelete((Fts5Tokenizer*)pRet);
  541. pRet = 0;
  542. }
  543. *ppOut = (Fts5Tokenizer*)pRet;
  544. return rc;
  545. }
  /*
  ** Bundle of a context pointer, a token callback and a character buffer.
  ** The xToken member has the same signature as the xToken callback taken
  ** by the tokenize functions in this file.
  ** NOTE(review): the code that fills in and uses this structure is not
  ** part of this section of the file - confirm the role of each field there.
  */
  typedef struct PorterContext {
    void *pCtx;
    int (*xToken)(void*, int, const char*, int, int, int);
    char *aBuf;
  } PorterContext;
  /*
  ** A single suffix-rewriting rule, as consumed by the (currently disabled)
  ** fts5PorterApply() function.
  */
  typedef struct PorterRule {
    const char *zSuffix;                  /* Suffix to match; NULL ends a rule list */
    int nSuffix;                          /* Length of zSuffix in bytes */
    int (*xCond)(char *zStem, int nStem); /* Condition on the stem, or NULL */
    const char *zOutput;                  /* Text that replaces the suffix */
    int nOutput;                          /* Length of zOutput in bytes */
  } PorterRule;
  #if 0
  /*
  ** This function is compiled out.
  **
  ** Apply the first rule in aRule[] (a list terminated by an entry with
  ** zSuffix==0) whose suffix matches the end of the nBuf byte buffer aBuf.
  ** If that rule has no condition, or if its condition accepts the stem,
  ** the suffix is replaced by the rule's output, *pnBuf is updated and the
  ** index of the rule is returned. Otherwise -1 is returned.
  */
  static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
    const int nBuf = *pnBuf;
    PorterRule *pRule;
    int nStem;

    for(pRule=aRule; pRule->zSuffix; pRule++){
      assert( strlen(pRule->zSuffix)==pRule->nSuffix );
      assert( strlen(pRule->zOutput)==pRule->nOutput );
      if( nBuf>=pRule->nSuffix
       && 0==memcmp(&aBuf[nBuf - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
      ){
        break;
      }
    }
    if( pRule->zSuffix==0 ) return -1;

    nStem = nBuf - pRule->nSuffix;
    if( pRule->xCond && !pRule->xCond(aBuf, nStem) ) return -1;

    memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
    *pnBuf = nStem + pRule->nOutput;
    return (int)(pRule - aRule);
  }
  #endif
  /*
  ** Return 1 if c is one of the lower-case vowels 'a', 'e', 'i', 'o' or 'u',
  ** or if c is 'y' and bYIsVowel is non-zero. Return 0 otherwise.
  */
  static int fts5PorterIsVowel(char c, int bYIsVowel){
    switch( c ){
      case 'a': case 'e': case 'i': case 'o': case 'u':
        return 1;
      case 'y':
        return bYIsVowel!=0;
      default:
        return 0;
    }
  }
  /*
  ** Scan the nStem byte string zStem for a vowel followed (not necessarily
  ** immediately) by a consonant. If such a pair is found, return the offset
  ** of the byte following that consonant. Otherwise return 0.
  **
  ** A 'y' counts as a vowel only if the character before it was classified
  ** as a consonant. bPrevCons supplies that classification for the
  ** character preceding zStem[0].
  */
  static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
    int bCons = bPrevCons;
    int i = 0;

    /* Scan for a vowel */
    while( i<nStem ){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      if( bCons==0 ) break;
      i++;
    }

    /* Scan for a consonant */
    for(i=i+1; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      if( bCons ) return i+1;
    }
    return 0;
  }
  /* porter rule condition: (m > 0)
  **
  ** True if the stem contains at least one vowel-consonant sequence. */
  static int fts5Porter_MGt0(char *zStem, int nStem){
    return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
  }
  /* porter rule condition: (m > 1)
  **
  ** True if a second vowel-consonant sequence follows the first one. */
  static int fts5Porter_MGt1(char *zStem, int nStem){
    int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
    if( nFirst==0 ) return 0;
    return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
  }
  /* porter rule condition: (m = 1)
  **
  ** True if the stem contains one vowel-consonant sequence and no second
  ** one after it. */
  static int fts5Porter_MEq1(char *zStem, int nStem){
    int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
    if( nFirst==0 ) return 0;
    return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
  }
  /* porter rule condition: (*o)
  **
  ** Return true if the nStem byte stem zStem ends with the sequence
  ** consonant-vowel-consonant and the final character is not 'w', 'x' or
  ** 'y'. Return false otherwise, including for an empty stem.
  **
  ** Every character must be classified from the start of the stem, because
  ** whether a 'y' is a vowel depends on the character before it. Only the
  ** three most recent classifications are retained in mask. This keeps the
  ** shift well defined for stems of any length: shifting a signed int left
  ** once per character is undefined behaviour once a set bit reaches the
  ** sign bit.
  */
  static int fts5Porter_Ostar(char *zStem, int nStem){
    unsigned int mask = 0;            /* Last three classifications, 1==consonant */
    int bCons = 0;
    char cLast;
    int i;

    /* An empty stem has no final character to inspect and cannot match. */
    if( nStem<1 ) return 0;

    cLast = zStem[nStem-1];
    if( cLast=='w' || cLast=='x' || cLast=='y' ) return 0;

    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = ((mask << 1) | (unsigned int)bCons) & 0x0007;
    }
    return (mask==0x0005);
  }
  /* porter rule condition: (m > 1 and (*S or *T))
  **
  ** True if the stem ends in 's' or 't' and also satisfies (m > 1). The
  ** stem must not be empty. */
  static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
    char cLast;
    assert( nStem>0 );
    cLast = zStem[nStem-1];
    if( cLast!='s' && cLast!='t' ) return 0;
    return fts5Porter_MGt1(zStem, nStem);
  }
  /* porter rule condition: (*v*)
  **
  ** True if the stem contains a vowel. A 'y' counts as a vowel anywhere
  ** except at the very start of the stem. */
  static int fts5Porter_Vowel(char *zStem, int nStem){
    int i = 0;
    while( i<nStem ){
      if( fts5PorterIsVowel(zStem[i], i>0) ) return 1;
      i++;
    }
    return 0;
  }
  654. /**************************************************************************
  655. ***************************************************************************
  656. ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
  657. */
  /*
  ** Porter stemmer step 4.
  **
  ** aBuf holds the current word and *pnBuf its length in bytes. If the word
  ** is longer than, and ends with, one of the suffixes listed below, and the
  ** stem that precedes the suffix satisfies (m > 1), the suffix is removed
  ** by reducing *pnBuf. For the suffix "ion" the stem must also end in 's'
  ** or 't'. Candidate suffixes are selected by the second-to-last character
  ** of the word. Within a group, the first suffix that matches decides the
  ** outcome, even if its condition then fails.
  **
  **   al  ance  ence  er  ic  able  ible  ant  ement  ment  ent
  **   ion  ou  ism  ate  iti  ous  ive  ize
  **
  ** The return value is always 0.
  **
  ** Words shorter than two bytes are rejected up front, since
  ** aBuf[nBuf-2] would otherwise index before the start of the buffer. No
  ** suffix can match such a word, so results are unaffected.
  **
  ** NOTE: this function sits in the part of the file marked as generated by
  ** mkportersteps.tcl. The guard needs to be mirrored in the generator or
  ** it will be lost when the code is regenerated.
  */
  static int fts5PorterStep4(char *aBuf, int *pnBuf){
    int ret = 0;
    int nBuf = *pnBuf;
    if( nBuf<2 ) return ret;
    switch( aBuf[nBuf-2] ){

      case 'a':
        if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
          if( fts5Porter_MGt1(aBuf, nBuf-2) ){
            *pnBuf = nBuf - 2;
          }
        }
        break;

      case 'c':
        if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
            *pnBuf = nBuf - 4;
          }
        }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
            *pnBuf = nBuf - 4;
          }
        }
        break;

      case 'e':
        if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
          if( fts5Porter_MGt1(aBuf, nBuf-2) ){
            *pnBuf = nBuf - 2;
          }
        }
        break;

      case 'i':
        if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
          if( fts5Porter_MGt1(aBuf, nBuf-2) ){
            *pnBuf = nBuf - 2;
          }
        }
        break;

      case 'l':
        if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
            *pnBuf = nBuf - 4;
          }
        }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
            *pnBuf = nBuf - 4;
          }
        }
        break;

      case 'n':
        if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
            *pnBuf = nBuf - 3;
          }
        }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
          if( fts5Porter_MGt1(aBuf, nBuf-5) ){
            *pnBuf = nBuf - 5;
          }
        }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
          if( fts5Porter_MGt1(aBuf, nBuf-4) ){
            *pnBuf = nBuf - 4;
          }
        }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
            *pnBuf = nBuf - 3;
          }
        }
        break;

      case 'o':
        if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
          if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
            *pnBuf = nBuf - 3;
          }
        }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
          if( fts5Porter_MGt1(aBuf, nBuf-2) ){
            *pnBuf = nBuf - 2;
          }
        }
        break;

      case 's':
        if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
            *pnBuf = nBuf - 3;
          }
        }
        break;

      case 't':
        if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
            *pnBuf = nBuf - 3;
          }
        }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
            *pnBuf = nBuf - 3;
          }
        }
        break;

      case 'u':
        if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
            *pnBuf = nBuf - 3;
          }
        }
        break;

      case 'v':
        if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
            *pnBuf = nBuf - 3;
          }
        }
        break;

      case 'z':
        if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
          if( fts5Porter_MGt1(aBuf, nBuf-3) ){
            *pnBuf = nBuf - 3;
          }
        }
        break;

    }
    return ret;
  }
  /*
  ** Porter stemmer step 1b, second part.
  **
  ** aBuf holds the current word and *pnBuf its length in bytes. If the word
  ** is longer than two bytes and ends in "at", "bl" or "iz", an 'e' is
  ** appended (giving "ate", "ble" or "ize"), *pnBuf is increased by one and
  ** 1 is returned. Otherwise the word is left untouched and 0 is returned.
  **
  ** The caller must ensure that aBuf has room for the extra byte.
  **
  ** Words shorter than two bytes are rejected up front, since
  ** aBuf[nBuf-2] would otherwise index before the start of the buffer. No
  ** suffix can match such a word, so results are unaffected.
  **
  ** NOTE: this function sits in the part of the file marked as generated by
  ** mkportersteps.tcl. The guard needs to be mirrored in the generator or
  ** it will be lost when the code is regenerated.
  */
  static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
    int ret = 0;
    int nBuf = *pnBuf;
    if( nBuf<2 ) return ret;
    switch( aBuf[nBuf-2] ){

      case 'a':
        if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
          memcpy(&aBuf[nBuf-2], "ate", 3);
          *pnBuf = nBuf - 2 + 3;
          ret = 1;
        }
        break;

      case 'b':
        if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
          memcpy(&aBuf[nBuf-2], "ble", 3);
          *pnBuf = nBuf - 2 + 3;
          ret = 1;
        }
        break;

      case 'i':
        if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
          memcpy(&aBuf[nBuf-2], "ize", 3);
          *pnBuf = nBuf - 2 + 3;
          ret = 1;
        }
        break;

    }
    return ret;
  }
  805. static int fts5PorterStep2(char *aBuf, int *pnBuf){
  806. int ret = 0;
  807. int nBuf = *pnBuf;
  808. switch( aBuf[nBuf-2] ){
  809. case 'a':
  810. if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
  811. if( fts5Porter_MGt0(aBuf, nBuf-7) ){
  812. memcpy(&aBuf[nBuf-7], "ate", 3);
  813. *pnBuf = nBuf - 7 + 3;
  814. }
  815. }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
  816. if( fts5Porter_MGt0(aBuf, nBuf-6) ){
  817. memcpy(&aBuf[nBuf-6], "tion", 4);
  818. *pnBuf = nBuf - 6 + 4;
  819. }
  820. }
  821. break;
  822. case 'c':
  823. if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
  824. if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  825. memcpy(&aBuf[nBuf-4], "ence", 4);
  826. *pnBuf = nBuf - 4 + 4;
  827. }
  828. }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
  829. if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  830. memcpy(&aBuf[nBuf-4], "ance", 4);
  831. *pnBuf = nBuf - 4 + 4;
  832. }
  833. }
  834. break;
  835. case 'e':
  836. if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
  837. if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  838. memcpy(&aBuf[nBuf-4], "ize", 3);
  839. *pnBuf = nBuf - 4 + 3;
  840. }
  841. }
  842. break;
  843. case 'g':
  844. if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
  845. if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  846. memcpy(&aBuf[nBuf-4], "log", 3);
  847. *pnBuf = nBuf - 4 + 3;
  848. }
  849. }
  850. break;
  851. case 'l':
  852. if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
  853. if( fts5Porter_MGt0(aBuf, nBuf-3) ){
  854. memcpy(&aBuf[nBuf-3], "ble", 3);
  855. *pnBuf = nBuf - 3 + 3;
  856. }
  857. }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
  858. if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  859. memcpy(&aBuf[nBuf-4], "al", 2);
  860. *pnBuf = nBuf - 4 + 2;
  861. }
  862. }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
  863. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  864. memcpy(&aBuf[nBuf-5], "ent", 3);
  865. *pnBuf = nBuf - 5 + 3;
  866. }
  867. }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
  868. if( fts5Porter_MGt0(aBuf, nBuf-3) ){
  869. memcpy(&aBuf[nBuf-3], "e", 1);
  870. *pnBuf = nBuf - 3 + 1;
  871. }
  872. }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
  873. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  874. memcpy(&aBuf[nBuf-5], "ous", 3);
  875. *pnBuf = nBuf - 5 + 3;
  876. }
  877. }
  878. break;
  879. case 'o':
  880. if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
  881. if( fts5Porter_MGt0(aBuf, nBuf-7) ){
  882. memcpy(&aBuf[nBuf-7], "ize", 3);
  883. *pnBuf = nBuf - 7 + 3;
  884. }
  885. }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
  886. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  887. memcpy(&aBuf[nBuf-5], "ate", 3);
  888. *pnBuf = nBuf - 5 + 3;
  889. }
  890. }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
  891. if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  892. memcpy(&aBuf[nBuf-4], "ate", 3);
  893. *pnBuf = nBuf - 4 + 3;
  894. }
  895. }
  896. break;
  897. case 's':
  898. if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
  899. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  900. memcpy(&aBuf[nBuf-5], "al", 2);
  901. *pnBuf = nBuf - 5 + 2;
  902. }
  903. }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
  904. if( fts5Porter_MGt0(aBuf, nBuf-7) ){
  905. memcpy(&aBuf[nBuf-7], "ive", 3);
  906. *pnBuf = nBuf - 7 + 3;
  907. }
  908. }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
  909. if( fts5Porter_MGt0(aBuf, nBuf-7) ){
  910. memcpy(&aBuf[nBuf-7], "ful", 3);
  911. *pnBuf = nBuf - 7 + 3;
  912. }
  913. }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
  914. if( fts5Porter_MGt0(aBuf, nBuf-7) ){
  915. memcpy(&aBuf[nBuf-7], "ous", 3);
  916. *pnBuf = nBuf - 7 + 3;
  917. }
  918. }
  919. break;
  920. case 't':
  921. if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
  922. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  923. memcpy(&aBuf[nBuf-5], "al", 2);
  924. *pnBuf = nBuf - 5 + 2;
  925. }
  926. }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
  927. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  928. memcpy(&aBuf[nBuf-5], "ive", 3);
  929. *pnBuf = nBuf - 5 + 3;
  930. }
  931. }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
  932. if( fts5Porter_MGt0(aBuf, nBuf-6) ){
  933. memcpy(&aBuf[nBuf-6], "ble", 3);
  934. *pnBuf = nBuf - 6 + 3;
  935. }
  936. }
  937. break;
  938. }
  939. return ret;
  940. }
  941. static int fts5PorterStep3(char *aBuf, int *pnBuf){
  942. int ret = 0;
  943. int nBuf = *pnBuf;
  944. switch( aBuf[nBuf-2] ){
  945. case 'a':
  946. if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
  947. if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  948. memcpy(&aBuf[nBuf-4], "ic", 2);
  949. *pnBuf = nBuf - 4 + 2;
  950. }
  951. }
  952. break;
  953. case 's':
  954. if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
  955. if( fts5Porter_MGt0(aBuf, nBuf-4) ){
  956. *pnBuf = nBuf - 4;
  957. }
  958. }
  959. break;
  960. case 't':
  961. if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
  962. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  963. memcpy(&aBuf[nBuf-5], "ic", 2);
  964. *pnBuf = nBuf - 5 + 2;
  965. }
  966. }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
  967. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  968. memcpy(&aBuf[nBuf-5], "ic", 2);
  969. *pnBuf = nBuf - 5 + 2;
  970. }
  971. }
  972. break;
  973. case 'u':
  974. if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
  975. if( fts5Porter_MGt0(aBuf, nBuf-3) ){
  976. *pnBuf = nBuf - 3;
  977. }
  978. }
  979. break;
  980. case 'v':
  981. if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
  982. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  983. *pnBuf = nBuf - 5;
  984. }
  985. }
  986. break;
  987. case 'z':
  988. if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
  989. if( fts5Porter_MGt0(aBuf, nBuf-5) ){
  990. memcpy(&aBuf[nBuf-5], "al", 2);
  991. *pnBuf = nBuf - 5 + 2;
  992. }
  993. }
  994. break;
  995. }
  996. return ret;
  997. }
  998. static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  999. int ret = 0;
  1000. int nBuf = *pnBuf;
  1001. switch( aBuf[nBuf-2] ){
  1002. case 'e':
  1003. if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
  1004. if( fts5Porter_MGt0(aBuf, nBuf-3) ){
  1005. memcpy(&aBuf[nBuf-3], "ee", 2);
  1006. *pnBuf = nBuf - 3 + 2;
  1007. }
  1008. }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
  1009. if( fts5Porter_Vowel(aBuf, nBuf-2) ){
  1010. *pnBuf = nBuf - 2;
  1011. ret = 1;
  1012. }
  1013. }
  1014. break;
  1015. case 'n':
  1016. if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
  1017. if( fts5Porter_Vowel(aBuf, nBuf-3) ){
  1018. *pnBuf = nBuf - 3;
  1019. ret = 1;
  1020. }
  1021. }
  1022. break;
  1023. }
  1024. return ret;
  1025. }
  1026. /*
  1027. ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
  1028. ***************************************************************************
  1029. **************************************************************************/
  1030. static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  1031. int nBuf = *pnBuf;
  1032. if( aBuf[nBuf-1]=='s' ){
  1033. if( aBuf[nBuf-2]=='e' ){
  1034. if( (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s')
  1035. || (nBuf>3 && aBuf[nBuf-3]=='i' )
  1036. ){
  1037. *pnBuf = nBuf-2;
  1038. }else{
  1039. *pnBuf = nBuf-1;
  1040. }
  1041. }
  1042. else if( aBuf[nBuf-2]!='s' ){
  1043. *pnBuf = nBuf-1;
  1044. }
  1045. }
  1046. }
  1047. static int fts5PorterCb(
  1048. void *pCtx,
  1049. int tflags,
  1050. const char *pToken,
  1051. int nToken,
  1052. int iStart,
  1053. int iEnd
  1054. ){
  1055. PorterContext *p = (PorterContext*)pCtx;
  1056. char *aBuf;
  1057. int nBuf;
  1058. if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
  1059. aBuf = p->aBuf;
  1060. nBuf = nToken;
  1061. memcpy(aBuf, pToken, nBuf);
  1062. /* Step 1. */
  1063. fts5PorterStep1A(aBuf, &nBuf);
  1064. if( fts5PorterStep1B(aBuf, &nBuf) ){
  1065. if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
  1066. char c = aBuf[nBuf-1];
  1067. if( fts5PorterIsVowel(c, 0)==0
  1068. && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
  1069. ){
  1070. nBuf--;
  1071. }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
  1072. aBuf[nBuf++] = 'e';
  1073. }
  1074. }
  1075. }
  1076. /* Step 1C. */
  1077. if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
  1078. aBuf[nBuf-1] = 'i';
  1079. }
  1080. /* Steps 2 through 4. */
  1081. fts5PorterStep2(aBuf, &nBuf);
  1082. fts5PorterStep3(aBuf, &nBuf);
  1083. fts5PorterStep4(aBuf, &nBuf);
  1084. /* Step 5a. */
  1085. assert( nBuf>0 );
  1086. if( aBuf[nBuf-1]=='e' ){
  1087. if( fts5Porter_MGt1(aBuf, nBuf-1)
  1088. || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
  1089. ){
  1090. nBuf--;
  1091. }
  1092. }
  1093. /* Step 5b. */
  1094. if( nBuf>1 && aBuf[nBuf-1]=='l'
  1095. && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
  1096. ){
  1097. nBuf--;
  1098. }
  1099. return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
  1100. pass_through:
  1101. return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
  1102. }
  1103. /*
  1104. ** Tokenize using the porter tokenizer.
  1105. */
  1106. static int fts5PorterTokenize(
  1107. Fts5Tokenizer *pTokenizer,
  1108. void *pCtx,
  1109. int flags,
  1110. const char *pText, int nText,
  1111. const char *pLoc, int nLoc,
  1112. int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
  1113. ){
  1114. PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
  1115. PorterContext sCtx;
  1116. sCtx.xToken = xToken;
  1117. sCtx.pCtx = pCtx;
  1118. sCtx.aBuf = p->aBuf;
  1119. return p->tokenizer_v2.xTokenize(
  1120. p->pTokenizer, (void*)&sCtx, flags, pText, nText, pLoc, nLoc, fts5PorterCb
  1121. );
  1122. }
  1123. /**************************************************************************
  1124. ** Start of trigram implementation.
  1125. */
  1126. typedef struct TrigramTokenizer TrigramTokenizer;
  1127. struct TrigramTokenizer {
  1128. int bFold; /* True to fold to lower-case */
  1129. int iFoldParam; /* Parameter to pass to Fts5UnicodeFold() */
  1130. };
  1131. /*
  1132. ** Free a trigram tokenizer.
  1133. */
  1134. static void fts5TriDelete(Fts5Tokenizer *p){
  1135. sqlite3_free(p);
  1136. }
  1137. /*
  1138. ** Allocate a trigram tokenizer.
  1139. */
  1140. static int fts5TriCreate(
  1141. void *pUnused,
  1142. const char **azArg,
  1143. int nArg,
  1144. Fts5Tokenizer **ppOut
  1145. ){
  1146. int rc = SQLITE_OK;
  1147. TrigramTokenizer *pNew = 0;
  1148. UNUSED_PARAM(pUnused);
  1149. if( nArg%2 ){
  1150. rc = SQLITE_ERROR;
  1151. }else{
  1152. int i;
  1153. pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
  1154. if( pNew==0 ){
  1155. rc = SQLITE_NOMEM;
  1156. }else{
  1157. pNew->bFold = 1;
  1158. pNew->iFoldParam = 0;
  1159. for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
  1160. const char *zArg = azArg[i+1];
  1161. if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
  1162. if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
  1163. rc = SQLITE_ERROR;
  1164. }else{
  1165. pNew->bFold = (zArg[0]=='0');
  1166. }
  1167. }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
  1168. if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
  1169. rc = SQLITE_ERROR;
  1170. }else{
  1171. pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0;
  1172. }
  1173. }else{
  1174. rc = SQLITE_ERROR;
  1175. }
  1176. }
  1177. if( pNew->iFoldParam!=0 && pNew->bFold==0 ){
  1178. rc = SQLITE_ERROR;
  1179. }
  1180. if( rc!=SQLITE_OK ){
  1181. fts5TriDelete((Fts5Tokenizer*)pNew);
  1182. pNew = 0;
  1183. }
  1184. }
  1185. }
  1186. *ppOut = (Fts5Tokenizer*)pNew;
  1187. return rc;
  1188. }
  1189. /*
  1190. ** Trigram tokenizer tokenize routine.
  1191. */
  1192. static int fts5TriTokenize(
  1193. Fts5Tokenizer *pTok,
  1194. void *pCtx,
  1195. int unusedFlags,
  1196. const char *pText, int nText,
  1197. int (*xToken)(void*, int, const char*, int, int, int)
  1198. ){
  1199. TrigramTokenizer *p = (TrigramTokenizer*)pTok;
  1200. int rc = SQLITE_OK;
  1201. char aBuf[32];
  1202. char *zOut = aBuf;
  1203. int ii;
  1204. const unsigned char *zIn = (const unsigned char*)pText;
  1205. const unsigned char *zEof = (zIn ? &zIn[nText] : 0);
  1206. u32 iCode = 0;
  1207. int aStart[3]; /* Input offset of each character in aBuf[] */
  1208. UNUSED_PARAM(unusedFlags);
  1209. /* Populate aBuf[] with the characters for the first trigram. */
  1210. for(ii=0; ii<3; ii++){
  1211. do {
  1212. aStart[ii] = zIn - (const unsigned char*)pText;
  1213. if( zIn>=zEof ) return SQLITE_OK;
  1214. READ_UTF8(zIn, zEof, iCode);
  1215. if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
  1216. }while( iCode==0 );
  1217. WRITE_UTF8(zOut, iCode);
  1218. }
  1219. /* At the start of each iteration of this loop:
  1220. **
  1221. ** aBuf: Contains 3 characters. The 3 characters of the next trigram.
  1222. ** zOut: Points to the byte following the last character in aBuf.
  1223. ** aStart[3]: Contains the byte offset in the input text corresponding
  1224. ** to the start of each of the three characters in the buffer.
  1225. */
  1226. assert( zIn<=zEof );
  1227. while( 1 ){
  1228. int iNext; /* Start of character following current tri */
  1229. const char *z1;
  1230. /* Read characters from the input up until the first non-diacritic */
  1231. do {
  1232. iNext = zIn - (const unsigned char*)pText;
  1233. if( zIn>=zEof ){
  1234. iCode = 0;
  1235. break;
  1236. }
  1237. READ_UTF8(zIn, zEof, iCode);
  1238. if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
  1239. }while( iCode==0 );
  1240. /* Pass the current trigram back to fts5 */
  1241. rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext);
  1242. if( iCode==0 || rc!=SQLITE_OK ) break;
  1243. /* Remove the first character from buffer aBuf[]. Append the character
  1244. ** with codepoint iCode. */
  1245. z1 = aBuf;
  1246. FTS5_SKIP_UTF8(z1);
  1247. memmove(aBuf, z1, zOut - z1);
  1248. zOut -= (z1 - aBuf);
  1249. WRITE_UTF8(zOut, iCode);
  1250. /* Update the aStart[] array */
  1251. aStart[0] = aStart[1];
  1252. aStart[1] = aStart[2];
  1253. aStart[2] = iNext;
  1254. }
  1255. return rc;
  1256. }
  1257. /*
  1258. ** Argument xCreate is a pointer to a constructor function for a tokenizer.
  1259. ** pTok is a tokenizer previously created using the same method. This function
  1260. ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
  1261. ** indicating the style of pattern matching that the tokenizer can support.
  1262. ** In practice, this is:
  1263. **
  1264. ** "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
  1265. ** "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
  1266. ** all other tokenizers - FTS5_PATTERN_NONE
  1267. */
  1268. int sqlite3Fts5TokenizerPattern(
  1269. int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
  1270. Fts5Tokenizer *pTok
  1271. ){
  1272. if( xCreate==fts5TriCreate ){
  1273. TrigramTokenizer *p = (TrigramTokenizer*)pTok;
  1274. if( p->iFoldParam==0 ){
  1275. return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
  1276. }
  1277. }
  1278. return FTS5_PATTERN_NONE;
  1279. }
  1280. /*
  1281. ** Return true if the tokenizer described by p->azArg[] is the trigram
  1282. ** tokenizer. This tokenizer needs to be loaded before xBestIndex is
  1283. ** called for the first time in order to correctly handle LIKE/GLOB.
  1284. */
  1285. int sqlite3Fts5TokenizerPreload(Fts5TokenizerConfig *p){
  1286. return (p->nArg>=1 && 0==sqlite3_stricmp(p->azArg[0], "trigram"));
  1287. }
  1288. /*
  1289. ** Register all built-in tokenizers with FTS5.
  1290. */
  1291. int sqlite3Fts5TokenizerInit(fts5_api *pApi){
  1292. struct BuiltinTokenizer {
  1293. const char *zName;
  1294. fts5_tokenizer x;
  1295. } aBuiltin[] = {
  1296. { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
  1297. { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
  1298. { "trigram", {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
  1299. };
  1300. int rc = SQLITE_OK; /* Return code */
  1301. int i; /* To iterate through builtin functions */
  1302. for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
  1303. rc = pApi->xCreateTokenizer(pApi,
  1304. aBuiltin[i].zName,
  1305. (void*)pApi,
  1306. &aBuiltin[i].x,
  1307. 0
  1308. );
  1309. }
  1310. if( rc==SQLITE_OK ){
  1311. fts5_tokenizer_v2 sPorter = {
  1312. 2,
  1313. fts5PorterCreate,
  1314. fts5PorterDelete,
  1315. fts5PorterTokenize
  1316. };
  1317. rc = pApi->xCreateTokenizer_v2(pApi,
  1318. "porter",
  1319. (void*)pApi,
  1320. &sPorter,
  1321. 0
  1322. );
  1323. }
  1324. return rc;
  1325. }