fts5_test_tok.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. /*
  2. ** 2013 Apr 22
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. ** This file contains code for the "fts5tokenize" virtual table module.
  14. ** An fts5tokenize virtual table is created as follows:
  15. **
  16. ** CREATE VIRTUAL TABLE <tbl> USING fts5tokenize(
  17. ** <tokenizer-name>, <arg-1>, ...
  18. ** );
  19. **
  20. ** The table created has the following schema:
  21. **
  22. ** CREATE TABLE <tbl>(input HIDDEN, token, start, end, position)
  23. **
  24. ** When queried, the query must include a WHERE clause of type:
  25. **
  26. ** input = <string>
  27. **
  28. ** The virtual table module tokenizes this <string>, using the FTS5
  29. ** tokenizer specified by the arguments to the CREATE VIRTUAL TABLE
  30. ** statement and returns one row for each token in the result. With
  31. ** fields set as follows:
  32. **
  33. ** input: Always set to a copy of <string>
  34. ** token: A token from the input.
  35. ** start: Byte offset of the token within the input <string>.
  36. ** end: Byte offset of the byte immediately following the end of the
  37. ** token within the input string.
  38. ** position: Token offset of token within input.
  39. **
  40. */
  41. #if defined(SQLITE_TEST) && defined(SQLITE_ENABLE_FTS5)
  42. #include "fts5.h"
  43. #include <string.h>
  44. #include <assert.h>
  /*
  ** Forward typedefs, listed in the order in which the structures are
  ** defined below, so that each can be referred to by its short name.
  */
  typedef struct Fts5tokTable Fts5tokTable;
  typedef struct Fts5tokRow Fts5tokRow;
  typedef struct Fts5tokCursor Fts5tokCursor;
  48. /*
  49. ** Virtual table structure.
  50. */
  51. struct Fts5tokTable {
  52. sqlite3_vtab base; /* Base class used by SQLite core */
  53. fts5_tokenizer tok; /* Tokenizer functions */
  54. Fts5Tokenizer *pTok; /* Tokenizer instance */
  55. };
  /*
  ** Holds the values for one row of output, i.e. one token reported by the
  ** tokenizer callback.
  */
  struct Fts5tokRow {
    char *zToken;            /* nul-terminated copy of the token text */
    int iStart;              /* Byte offset of the token within the input */
    int iEnd;                /* Byte offset just past the end of the token */
    int iPos;                /* Token position; colocated tokens share one */
  };
  65. /*
  66. ** Virtual table cursor structure.
  67. */
  68. struct Fts5tokCursor {
  69. sqlite3_vtab_cursor base; /* Base class used by SQLite core */
  70. int iRowid; /* Current 'rowid' value */
  71. char *zInput; /* Input string */
  72. int nRow; /* Number of entries in aRow[] */
  73. Fts5tokRow *aRow; /* Array of rows to return */
  74. };
  /*
  ** Remove SQL-style quoting from the nul-terminated string z[], in place.
  **
  ** If the first character of z[] is one of [ ' " or `, then the text up to
  ** the matching close-quote character (] for an opening [) is copied down
  ** to the start of the buffer. Two consecutive close-quote characters are
  ** an escape that yields a single literal quote character. Anything that
  ** follows the close quote is discarded. A missing close quote is
  ** tolerated - the end of the string terminates the value.
  **
  ** If z[] does not begin with a quote character it is left unmodified.
  */
  static void fts5tokDequote(char *z){
    char cClose;                    /* Character that terminates the value */
    const char *zRead;              /* Next character to read */
    char *zWrite;                   /* Next character to write */

    switch( z[0] ){
      case '[':
        cClose = ']';
        break;
      case '\'':
      case '"':
      case '`':
        cClose = z[0];
        break;
      default:
        return;                     /* Not quoted - nothing to do */
    }

    /* The write cursor always trails the read cursor, so the copy can be
    ** done within the one buffer. */
    zRead = &z[1];
    zWrite = z;
    while( *zRead ){
      if( *zRead!=cClose ){
        *zWrite++ = *zRead++;
      }else if( zRead[1]==cClose ){
        /* Escaped quote: emit one quote character, consume two. */
        *zWrite++ = cClose;
        zRead += 2;
      }else{
        /* The close quote. */
        break;
      }
    }
    *zWrite = '\0';
  }
  101. /*
  102. ** The second argument, argv[], is an array of pointers to nul-terminated
  103. ** strings. This function makes a copy of the array and strings into a
  104. ** single block of memory. It then dequotes any of the strings that appear
  105. ** to be quoted.
  106. **
  107. ** If successful, output parameter *pazDequote is set to point at the
  108. ** array of dequoted strings and SQLITE_OK is returned. The caller is
  109. ** responsible for eventually calling sqlite3_free() to free the array
  110. ** in this case. Or, if an error occurs, an SQLite error code is returned.
  111. ** The final value of *pazDequote is undefined in this case.
  112. */
  113. static int fts5tokDequoteArray(
  114. int argc, /* Number of elements in argv[] */
  115. const char * const *argv, /* Input array */
  116. char ***pazDequote /* Output array */
  117. ){
  118. int rc = SQLITE_OK; /* Return code */
  119. if( argc==0 ){
  120. *pazDequote = 0;
  121. }else{
  122. int i;
  123. int nByte = 0;
  124. char **azDequote;
  125. for(i=0; i<argc; i++){
  126. nByte += (int)(strlen(argv[i]) + 1);
  127. }
  128. *pazDequote = azDequote = sqlite3_malloc64(sizeof(char *)*argc + nByte);
  129. if( azDequote==0 ){
  130. rc = SQLITE_NOMEM;
  131. }else{
  132. char *pSpace = (char *)&azDequote[argc];
  133. for(i=0; i<argc; i++){
  134. int n = (int)strlen(argv[i]);
  135. azDequote[i] = pSpace;
  136. memcpy(pSpace, argv[i], n+1);
  137. fts5tokDequote(pSpace);
  138. pSpace += (n+1);
  139. }
  140. }
  141. }
  142. return rc;
  143. }
  144. /*
  145. ** Schema of the tokenizer table.
  146. */
  147. #define FTS3_TOK_SCHEMA "CREATE TABLE x(input HIDDEN, token, start, end, position)"
  148. /*
  149. ** This function does all the work for both the xConnect and xCreate methods.
  150. ** These tables have no persistent representation of their own, so xConnect
  151. ** and xCreate are identical operations.
  152. **
  153. ** argv[0]: module name
  154. ** argv[1]: database name
  155. ** argv[2]: table name
  156. ** argv[3]: first argument (tokenizer name)
  157. */
  158. static int fts5tokConnectMethod(
  159. sqlite3 *db, /* Database connection */
  160. void *pCtx, /* Pointer to fts5_api object */
  161. int argc, /* Number of elements in argv array */
  162. const char * const *argv, /* xCreate/xConnect argument array */
  163. sqlite3_vtab **ppVtab, /* OUT: New sqlite3_vtab object */
  164. char **pzErr /* OUT: sqlite3_malloc'd error message */
  165. ){
  166. fts5_api *pApi = (fts5_api*)pCtx;
  167. Fts5tokTable *pTab = 0;
  168. int rc;
  169. char **azDequote = 0;
  170. int nDequote = 0;
  171. rc = sqlite3_declare_vtab(db,
  172. "CREATE TABLE x(input HIDDEN, token, start, end, position)"
  173. );
  174. if( rc==SQLITE_OK ){
  175. nDequote = argc-3;
  176. rc = fts5tokDequoteArray(nDequote, &argv[3], &azDequote);
  177. }
  178. if( rc==SQLITE_OK ){
  179. pTab = (Fts5tokTable*)sqlite3_malloc(sizeof(Fts5tokTable));
  180. if( pTab==0 ){
  181. rc = SQLITE_NOMEM;
  182. }else{
  183. memset(pTab, 0, sizeof(Fts5tokTable));
  184. }
  185. }
  186. if( rc==SQLITE_OK ){
  187. void *pTokCtx = 0;
  188. const char *zModule = 0;
  189. if( nDequote>0 ){
  190. zModule = azDequote[0];
  191. }
  192. rc = pApi->xFindTokenizer(pApi, zModule, &pTokCtx, &pTab->tok);
  193. if( rc==SQLITE_OK ){
  194. const char **azArg = (nDequote>1 ? (const char **)&azDequote[1] : 0);
  195. int nArg = nDequote>0 ? nDequote-1 : 0;
  196. rc = pTab->tok.xCreate(pTokCtx, azArg, nArg, &pTab->pTok);
  197. }
  198. }
  199. if( rc!=SQLITE_OK ){
  200. sqlite3_free(pTab);
  201. pTab = 0;
  202. }
  203. *ppVtab = (sqlite3_vtab*)pTab;
  204. sqlite3_free(azDequote);
  205. return rc;
  206. }
  207. /*
  208. ** This function does the work for both the xDisconnect and xDestroy methods.
  209. ** These tables have no persistent representation of their own, so xDisconnect
  210. ** and xDestroy are identical operations.
  211. */
  212. static int fts5tokDisconnectMethod(sqlite3_vtab *pVtab){
  213. Fts5tokTable *pTab = (Fts5tokTable *)pVtab;
  214. if( pTab->pTok ){
  215. pTab->tok.xDelete(pTab->pTok);
  216. }
  217. sqlite3_free(pTab);
  218. return SQLITE_OK;
  219. }
  220. /*
  221. ** xBestIndex - Analyze a WHERE and ORDER BY clause.
  222. */
  223. static int fts5tokBestIndexMethod(
  224. sqlite3_vtab *pVTab,
  225. sqlite3_index_info *pInfo
  226. ){
  227. int i;
  228. for(i=0; i<pInfo->nConstraint; i++){
  229. if( pInfo->aConstraint[i].usable
  230. && pInfo->aConstraint[i].iColumn==0
  231. && pInfo->aConstraint[i].op==SQLITE_INDEX_CONSTRAINT_EQ
  232. ){
  233. pInfo->idxNum = 1;
  234. pInfo->aConstraintUsage[i].argvIndex = 1;
  235. pInfo->aConstraintUsage[i].omit = 1;
  236. pInfo->estimatedCost = 1;
  237. return SQLITE_OK;
  238. }
  239. }
  240. pInfo->idxNum = 0;
  241. assert( pInfo->estimatedCost>1000000.0 );
  242. return SQLITE_OK;
  243. }
  244. /*
  245. ** xOpen - Open a cursor.
  246. */
  247. static int fts5tokOpenMethod(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCsr){
  248. Fts5tokCursor *pCsr;
  249. pCsr = (Fts5tokCursor *)sqlite3_malloc(sizeof(Fts5tokCursor));
  250. if( pCsr==0 ){
  251. return SQLITE_NOMEM;
  252. }
  253. memset(pCsr, 0, sizeof(Fts5tokCursor));
  254. *ppCsr = (sqlite3_vtab_cursor *)pCsr;
  255. return SQLITE_OK;
  256. }
  257. /*
  258. ** Reset the tokenizer cursor passed as the only argument. As if it had
  259. ** just been returned by fts5tokOpenMethod().
  260. */
  261. static void fts5tokResetCursor(Fts5tokCursor *pCsr){
  262. int i;
  263. for(i=0; i<pCsr->nRow; i++){
  264. sqlite3_free(pCsr->aRow[i].zToken);
  265. }
  266. sqlite3_free(pCsr->zInput);
  267. sqlite3_free(pCsr->aRow);
  268. pCsr->zInput = 0;
  269. pCsr->aRow = 0;
  270. pCsr->nRow = 0;
  271. pCsr->iRowid = 0;
  272. }
  273. /*
  274. ** xClose - Close a cursor.
  275. */
  276. static int fts5tokCloseMethod(sqlite3_vtab_cursor *pCursor){
  277. Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
  278. fts5tokResetCursor(pCsr);
  279. sqlite3_free(pCsr);
  280. return SQLITE_OK;
  281. }
  282. /*
  283. ** xNext - Advance the cursor to the next row, if any.
  284. */
  285. static int fts5tokNextMethod(sqlite3_vtab_cursor *pCursor){
  286. Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
  287. pCsr->iRowid++;
  288. return SQLITE_OK;
  289. }
  290. static int fts5tokCb(
  291. void *pCtx, /* Pointer to Fts5tokCursor */
  292. int tflags, /* Mask of FTS5_TOKEN_* flags */
  293. const char *pToken, /* Pointer to buffer containing token */
  294. int nToken, /* Size of token in bytes */
  295. int iStart, /* Byte offset of token within input text */
  296. int iEnd /* Byte offset of end of token within input text */
  297. ){
  298. Fts5tokCursor *pCsr = (Fts5tokCursor*)pCtx;
  299. Fts5tokRow *pRow;
  300. if( (pCsr->nRow & (pCsr->nRow-1))==0 ){
  301. int nNew = pCsr->nRow ? pCsr->nRow*2 : 32;
  302. Fts5tokRow *aNew;
  303. aNew = (Fts5tokRow*)sqlite3_realloc64(pCsr->aRow, nNew*sizeof(Fts5tokRow));
  304. if( aNew==0 ) return SQLITE_NOMEM;
  305. memset(&aNew[pCsr->nRow], 0, sizeof(Fts5tokRow)*(nNew-pCsr->nRow));
  306. pCsr->aRow = aNew;
  307. }
  308. pRow = &pCsr->aRow[pCsr->nRow];
  309. pRow->iStart = iStart;
  310. pRow->iEnd = iEnd;
  311. if( pCsr->nRow ){
  312. pRow->iPos = pRow[-1].iPos + ((tflags & FTS5_TOKEN_COLOCATED) ? 0 : 1);
  313. }
  314. pRow->zToken = sqlite3_malloc(nToken+1);
  315. if( pRow->zToken==0 ) return SQLITE_NOMEM;
  316. memcpy(pRow->zToken, pToken, nToken);
  317. pRow->zToken[nToken] = 0;
  318. pCsr->nRow++;
  319. return SQLITE_OK;
  320. }
  321. /*
  322. ** xFilter - Initialize a cursor to point at the start of its data.
  323. */
  324. static int fts5tokFilterMethod(
  325. sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
  326. int idxNum, /* Strategy index */
  327. const char *idxStr, /* Unused */
  328. int nVal, /* Number of elements in apVal */
  329. sqlite3_value **apVal /* Arguments for the indexing scheme */
  330. ){
  331. int rc = SQLITE_ERROR;
  332. Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
  333. Fts5tokTable *pTab = (Fts5tokTable *)(pCursor->pVtab);
  334. fts5tokResetCursor(pCsr);
  335. if( idxNum==1 ){
  336. const char *zByte = (const char *)sqlite3_value_text(apVal[0]);
  337. int nByte = sqlite3_value_bytes(apVal[0]);
  338. pCsr->zInput = sqlite3_malloc(nByte+1);
  339. if( pCsr->zInput==0 ){
  340. rc = SQLITE_NOMEM;
  341. }else{
  342. if( nByte>0 ) memcpy(pCsr->zInput, zByte, nByte);
  343. pCsr->zInput[nByte] = 0;
  344. rc = pTab->tok.xTokenize(
  345. pTab->pTok, (void*)pCsr, 0, zByte, nByte, fts5tokCb
  346. );
  347. }
  348. }
  349. if( rc!=SQLITE_OK ) return rc;
  350. return fts5tokNextMethod(pCursor);
  351. }
  352. /*
  353. ** xEof - Return true if the cursor is at EOF, or false otherwise.
  354. */
  355. static int fts5tokEofMethod(sqlite3_vtab_cursor *pCursor){
  356. Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
  357. return (pCsr->iRowid>pCsr->nRow);
  358. }
  359. /*
  360. ** xColumn - Return a column value.
  361. */
  362. static int fts5tokColumnMethod(
  363. sqlite3_vtab_cursor *pCursor, /* Cursor to retrieve value from */
  364. sqlite3_context *pCtx, /* Context for sqlite3_result_xxx() calls */
  365. int iCol /* Index of column to read value from */
  366. ){
  367. Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
  368. Fts5tokRow *pRow = &pCsr->aRow[pCsr->iRowid-1];
  369. /* CREATE TABLE x(input, token, start, end, position) */
  370. switch( iCol ){
  371. case 0:
  372. sqlite3_result_text(pCtx, pCsr->zInput, -1, SQLITE_TRANSIENT);
  373. break;
  374. case 1:
  375. sqlite3_result_text(pCtx, pRow->zToken, -1, SQLITE_TRANSIENT);
  376. break;
  377. case 2:
  378. sqlite3_result_int(pCtx, pRow->iStart);
  379. break;
  380. case 3:
  381. sqlite3_result_int(pCtx, pRow->iEnd);
  382. break;
  383. default:
  384. assert( iCol==4 );
  385. sqlite3_result_int(pCtx, pRow->iPos);
  386. break;
  387. }
  388. return SQLITE_OK;
  389. }
  390. /*
  391. ** xRowid - Return the current rowid for the cursor.
  392. */
  393. static int fts5tokRowidMethod(
  394. sqlite3_vtab_cursor *pCursor, /* Cursor to retrieve value from */
  395. sqlite_int64 *pRowid /* OUT: Rowid value */
  396. ){
  397. Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
  398. *pRowid = (sqlite3_int64)pCsr->iRowid;
  399. return SQLITE_OK;
  400. }
  401. /*
  402. ** Register the fts5tok module with database connection db. Return SQLITE_OK
  403. ** if successful or an error code if sqlite3_create_module() fails.
  404. */
  405. int sqlite3Fts5TestRegisterTok(sqlite3 *db, fts5_api *pApi){
  406. static const sqlite3_module fts5tok_module = {
  407. 0, /* iVersion */
  408. fts5tokConnectMethod, /* xCreate */
  409. fts5tokConnectMethod, /* xConnect */
  410. fts5tokBestIndexMethod, /* xBestIndex */
  411. fts5tokDisconnectMethod, /* xDisconnect */
  412. fts5tokDisconnectMethod, /* xDestroy */
  413. fts5tokOpenMethod, /* xOpen */
  414. fts5tokCloseMethod, /* xClose */
  415. fts5tokFilterMethod, /* xFilter */
  416. fts5tokNextMethod, /* xNext */
  417. fts5tokEofMethod, /* xEof */
  418. fts5tokColumnMethod, /* xColumn */
  419. fts5tokRowidMethod, /* xRowid */
  420. 0, /* xUpdate */
  421. 0, /* xBegin */
  422. 0, /* xSync */
  423. 0, /* xCommit */
  424. 0, /* xRollback */
  425. 0, /* xFindFunction */
  426. 0, /* xRename */
  427. 0, /* xSavepoint */
  428. 0, /* xRelease */
  429. 0, /* xRollbackTo */
  430. 0, /* xShadowName */
  431. 0 /* xIntegrity */
  432. };
  433. int rc; /* Return code */
  434. rc = sqlite3_create_module(db, "fts5tokenize", &fts5tok_module, (void*)pApi);
  435. return rc;
  436. }
  437. #endif /* defined(SQLITE_TEST) && defined(SQLITE_ENABLE_FTS5) */