123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136 |
- 1. FTS3 Tokenizers
- When creating a new full-text table, FTS3 allows the user to select
- the text tokenizer implementation to be used when indexing text
- by specifying a "tokenize" clause as part of the CREATE VIRTUAL TABLE
- statement:
- CREATE VIRTUAL TABLE <table-name> USING fts3(
- <columns ...> [, tokenize <tokenizer-name> [<tokenizer-args>]]
- );
- The built-in tokenizers (valid values to pass as <tokenizer-name>) are
- "simple", "porter" and "unicode61".
- <tokenizer-args> should consist of zero or more white-space separated
- arguments to pass to the selected tokenizer implementation. The
- interpretation of the arguments, if any, depends on the individual
- tokenizer.
- 2. Custom Tokenizers
- FTS3 allows users to provide custom tokenizer implementations. The
- interface used to create a new tokenizer is defined and described in
- the fts3_tokenizer.h source file.
- Registering a new FTS3 tokenizer is similar to registering a new
- virtual table module with SQLite. The user passes a pointer to a
- structure containing pointers to various callback functions that
- make up the implementation of the new tokenizer type. For tokenizers,
- the structure (defined in fts3_tokenizer.h) is called
- "sqlite3_tokenizer_module".
- FTS3 does not expose a C-function that users call to register new
- tokenizer types with a database handle. Instead, the pointer must
- be encoded as an SQL blob value and passed to FTS3 through the SQL
- engine by evaluating a special scalar function, "fts3_tokenizer()".
- The fts3_tokenizer() function may be called with one or two arguments,
- as follows:
- SELECT fts3_tokenizer(<tokenizer-name>);
- SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
-
- Where <tokenizer-name> is a string identifying the tokenizer and
- <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
- structure encoded as an SQL blob. If the second argument is present,
- it is registered as tokenizer <tokenizer-name> and a copy of it is
- returned. If only one argument is passed, a pointer to the tokenizer
- implementation currently registered as <tokenizer-name> is returned,
- encoded as a blob. Or, if no such tokenizer exists, an SQL exception
- (error) is raised.
- SECURITY: If the fts3 extension is used in an environment where potentially
- malicious users may execute arbitrary SQL (e.g. Gears), they should be
- prevented from invoking the fts3_tokenizer() function. The
- fts3_tokenizer() function is disabled by default. It is only enabled
- by SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER. Do not enable it in
- security sensitive environments.
- See "Sample code" below for an example of calling the fts3_tokenizer()
- function from C code.
- 3. ICU Library Tokenizers
- If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
- symbol defined, then there exists a built-in tokenizer named "icu"
- implemented using the ICU library. The first argument passed to the
- xCreate() method (see fts3_tokenizer.h) of this tokenizer may be
- an ICU locale identifier. For example "tr_TR" for Turkish as used
- in Turkey, or "en_AU" for English as used in Australia. For example:
- "CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize icu th_TH)"
- The ICU tokenizer implementation is very simple. It splits the input
- text according to the ICU rules for finding word boundaries and discards
- any tokens that consist entirely of white-space. This may be suitable
- for some applications in some locales, but not all. If more complex
- processing is required, for example to implement stemming or
- discard punctuation, this can be done by creating a tokenizer
- implementation that uses the ICU tokenizer as part of its implementation.
- When using the ICU tokenizer this way, it is safe to overwrite the
- contents of the strings returned by the xNext() method (see
- fts3_tokenizer.h).
- 4. Sample code.
- The following two code samples illustrate the way C code should invoke
- the fts3_tokenizer() scalar function:
- /*
- ** Register tokenizer module p with database handle db under the name
- ** zName by evaluating "SELECT fts3_tokenizer(?, ?)". The value bound to
- ** the second parameter is the pointer p itself, encoded as a blob of
- ** sizeof(p) bytes, not the structure it points to.
- **
- ** Returns SQLITE_OK on success, or an SQLite error code if preparing,
- ** binding or evaluating the statement fails.
- */
- int registerTokenizer(
-   sqlite3 *db,
-   char *zName,
-   const sqlite3_tokenizer_module *p
- ){
-   int rc;
-   int rcFinal;
-   sqlite3_stmt *pStmt = 0;
-   const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
-
-   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
-   if( rc!=SQLITE_OK ){
-     return rc;
-   }
-
-   /* SQLITE_STATIC is sufficient: the statement is finalized before this
-   ** function returns, so the bound buffers are not used afterwards. */
-   rc = sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
-   if( rc==SQLITE_OK ){
-     rc = sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
-   }
-
-   /* Only evaluate the statement if both parameters were bound. Any error
-   ** raised by sqlite3_step() is reported by sqlite3_finalize() below. */
-   if( rc==SQLITE_OK ){
-     sqlite3_step(pStmt);
-   }
-
-   /* Always finalize so the statement is not leaked. A bind failure is
-   ** not an evaluation error, so it must be returned explicitly. */
-   rcFinal = sqlite3_finalize(pStmt);
-   return rc!=SQLITE_OK ? rc : rcFinal;
- }
-
- /*
- ** Look up the tokenizer module currently registered with database
- ** handle db under the name zName by evaluating
- ** "SELECT fts3_tokenizer(?)".
- **
- ** *pp is always set to 0 first. It is overwritten with the module
- ** pointer only if the query returns a row whose first column is a blob
- ** exactly sizeof(*pp) bytes in size.
- **
- ** Returns SQLITE_OK on success, or an SQLite error code if preparing,
- ** binding or evaluating the statement fails. *pp is left set to 0 in
- ** that case.
- */
- int queryTokenizer(
-   sqlite3 *db,
-   char *zName,
-   const sqlite3_tokenizer_module **pp
- ){
-   int rc;
-   int rcFinal;
-   sqlite3_stmt *pStmt = 0;
-   const char zSql[] = "SELECT fts3_tokenizer(?)";
-
-   *pp = 0;
-   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
-   if( rc!=SQLITE_OK ){
-     return rc;
-   }
-
-   rc = sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
-   if( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pStmt) ){
-     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
-       /* Fetch the blob before asking for its size, then refuse to copy
-       ** unless it is exactly one pointer wide. Otherwise memcpy() would
-       ** read past the end of a short blob or dereference NULL. */
-       const void *pBlob = sqlite3_column_blob(pStmt, 0);
-       if( pBlob!=0 && sqlite3_column_bytes(pStmt, 0)==(int)sizeof(*pp) ){
-         memcpy(pp, pBlob, sizeof(*pp));
-       }
-     }
-   }
-
-   /* Always finalize so the statement is not leaked. sqlite3_finalize()
-   ** reports errors from sqlite3_step(); a bind failure is returned
-   ** explicitly. */
-   rcFinal = sqlite3_finalize(pStmt);
-   return rc!=SQLITE_OK ? rc : rcFinal;
- }
|