| # 2014 Dec 20 |
| # |
| # The author disclaims copyright to this source code. In place of |
| # a legal notice, here is a blessing: |
| # |
| # May you do good and not evil. |
| # May you find forgiveness for yourself and forgive others. |
| # May you share freely, never taking more than you give. |
| # |
| #*********************************************************************** |
| # |
| # Tests focusing on the built-in fts5 tokenizers. |
| # |
| |
| source [file join [file dirname [info script]] fts5_common.tcl] |
| set testprefix fts5tokenizer |
| |
| # If SQLITE_ENABLE_FTS5 is not defined, omit this file. |
| ifcapable !fts5 { |
| finish_test |
| return |
| } |
| |
| |
| do_execsql_test 1.0 { |
| CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter); |
| DROP TABLE ft1; |
| } |
| do_execsql_test 1.1 { |
| CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter'); |
| DROP TABLE ft1; |
| } |
| do_execsql_test 1.2 { |
| CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter); |
| DROP TABLE ft1; |
| } |
| do_execsql_test 1.3 { |
| CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter'); |
| DROP TABLE ft1; |
| } |
| do_execsql_test 1.4 { |
| CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii'); |
| DROP TABLE ft1; |
| } |
| |
| do_catchsql_test 1.5 { |
| CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch'); |
| } {1 {no such tokenizer: nosuch}} |
| |
| do_catchsql_test 1.6 { |
| CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch'); |
| } {1 {error in tokenizer constructor}} |
| |
| do_execsql_test 2.0 { |
| CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter); |
| INSERT INTO ft1 VALUES('embedded databases'); |
| } |
| do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1 |
| do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1 |
| do_execsql_test 2.3 { |
| SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding' |
| } 1 |
| |
| proc tcl_create {args} { |
| set ::targs $args |
| error "failed" |
| } |
| sqlite3_fts5_create_tokenizer db tcl tcl_create |
| |
| foreach {tn directive expected} { |
| 1 {tokenize='tcl a b c'} {a b c} |
| 2 {tokenize='tcl ''d'' ''e'' ''f'''} {d e f} |
| 3 {tokenize="tcl 'g' 'h' 'i'"} {g h i} |
| 4 {tokenize = tcl} {} |
| } { |
| do_catchsql_test 3.$tn.1 " |
| CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive) |
| " {1 {error in tokenizer constructor}} |
| do_test 3.$tn.2 { set ::targs } $expected |
| } |
| |
| do_catchsql_test 4.1 { |
| CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc); |
| } {1 {parse error in "tokenize = tcl abc"}} |
| do_catchsql_test 4.2 { |
| CREATE VIRTUAL TABLE ft2 USING fts5(x y) |
| } {1 {unrecognized column option: y}} |
| |
| #------------------------------------------------------------------------- |
| # Test the "separators" and "tokenchars" options a bit. |
| # |
| foreach {tn tokenizer} {1 ascii 2 unicode61} { |
| reset_db |
| set T "$tokenizer tokenchars ',.:' separators 'xyz'" |
| execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")" |
| do_execsql_test 5.$tn.1 { |
| INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz'); |
| } |
| foreach {tn2 token res} { |
| 1 abc 1 2 def 1 3 ghi 1 4 jkl {} |
| 5 mno {} 6 pqr {} 7 stu {} 8 jkl.mno,pqr:stu 1 |
| 9 vw 1 |
| } { |
| do_execsql_test 5.$tn.2.$tn2 " |
| SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"' |
| " $res |
| } |
| } |
| |
| #------------------------------------------------------------------------- |
| # Miscellaneous tests for the ascii tokenizer. |
| # |
| # 5.1.*: Test that the ascii tokenizer ignores non-ASCII characters in the |
| # 'separators' option. But unicode61 does not. |
| # |
| # 5.2.*: An option without an argument is an error. |
| # |
| |
| do_test 5.1.1 { |
| execsql " |
| CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`); |
| INSERT INTO a1 VALUES('abc\u1234def'); |
| " |
| execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' } |
| } {} |
| |
| do_test 5.1.2 { |
| execsql " |
| CREATE VIRTUAL TABLE a2 USING fts5( |
| x, tokenize=`unicode61 separators '\u1234'`); |
| INSERT INTO a2 VALUES('abc\u1234def'); |
| " |
| execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' } |
| } {1} |
| |
| do_catchsql_test 5.2 { |
| CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars'); |
| } {1 {error in tokenizer constructor}} |
| do_catchsql_test 5.3 { |
| CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg'); |
| } {1 {error in tokenizer constructor}} |
| |
| #------------------------------------------------------------------------- |
| # Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE |
| # correctly. |
| # |
| |
| proc test_token_cb {varname token iStart iEnd} { |
| upvar $varname var |
| lappend var $token |
| if {[llength $var]==3} { return "SQLITE_DONE" } |
| return "SQLITE_OK" |
| } |
| |
| proc tokenize {cmd} { |
| set res [list] |
| $cmd xTokenize [$cmd xColumnText 0] [list test_token_cb res] |
| set res |
| } |
| sqlite3_fts5_create_function db tokenize tokenize |
| |
| do_execsql_test 6.0 { |
| CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii); |
| INSERT INTO x1 VALUES('q w e r t y'); |
| INSERT INTO x1 VALUES('y t r e w q'); |
| SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r'; |
| } { |
| {q w e} {y t r} |
| } |
| |
| do_execsql_test 6.1 { |
| CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61); |
| INSERT INTO x2 VALUES('q w e r t y'); |
| INSERT INTO x2 VALUES('y t r e w q'); |
| SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r'; |
| } { |
| {q w e} {y t r} |
| } |
| |
| |
| #------------------------------------------------------------------------- |
| # Miscellaneous tests for the unicode tokenizer. |
| # |
| do_catchsql_test 6.1 { |
| CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars'); |
| } {1 {error in tokenizer constructor}} |
| do_catchsql_test 6.2 { |
| CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b'); |
| } {1 {error in tokenizer constructor}} |
| do_catchsql_test 6.3 { |
| CREATE VIRTUAL TABLE a3 USING fts5( |
| x, y, tokenize = 'unicode61 remove_diacritics 3' |
| ); |
| } {1 {error in tokenizer constructor}} |
| do_catchsql_test 6.4 { |
| CREATE VIRTUAL TABLE a3 USING fts5( |
| x, y, tokenize = 'unicode61 remove_diacritics 10' |
| ); |
| } {1 {error in tokenizer constructor}} |
| |
| #------------------------------------------------------------------------- |
| # Porter tokenizer with very large tokens. |
| # |
| set a [string repeat a 100] |
| set b [string repeat b 500] |
| set c [string repeat c 1000] |
| do_execsql_test 7.0 { |
| CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter); |
| INSERT INTO e5 VALUES($a || ' ' || $b); |
| INSERT INTO e5 VALUES($b || ' ' || $c); |
| INSERT INTO e5 VALUES($c || ' ' || $a); |
| } |
| |
| do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 } |
| do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 } |
| do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 } |
| |
| #------------------------------------------------------------------------- |
| # Test the 'separators' option with the unicode61 tokenizer. |
| # |
| do_execsql_test 8.1 { |
| BEGIN; |
| CREATE VIRTUAL TABLE e6 USING fts5(x, |
| tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| ); |
| INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog'); |
| CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row'); |
| SELECT term FROM e7; |
| ROLLBACK; |
| } { |
| brown dog fox jumped lazy over quick the |
| } |
| |
| do_execsql_test 8.2 [subst { |
| BEGIN; |
| CREATE VIRTUAL TABLE e6 USING fts5(x, |
| tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'" |
| ); |
| INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01' |
| || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog' |
| ); |
| INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09'); |
| CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row'); |
| SELECT term FROM e7; |
| ROLLBACK; |
| }] [subst { |
| brown dog fox jumped lazy over quick the \u0E08 \u0E09 |
| }] |
| |
| # Test that the porter tokenizer correctly passes arguments through to |
| # its parent tokenizer. |
| do_execsql_test 8.3 { |
| BEGIN; |
| CREATE VIRTUAL TABLE e6 USING fts5(x, |
| tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| ); |
| INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog'); |
| CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row'); |
| SELECT term FROM e7; |
| ROLLBACK; |
| } { |
| brown dog fox jump lazi over quick the |
| } |
| |
| #------------------------------------------------------------------------- |
| # Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer |
| # implementation. |
| # |
| reset_db |
| proc tcl_create {args} { return "tcl_tokenize" } |
| sqlite3_fts5_create_tokenizer db tcl tcl_create |
| set ::flags [list] |
| proc tcl_tokenize {tflags text} { |
| lappend ::flags $tflags |
| foreach {w iStart iEnd} [fts5_tokenize_split $text] { |
| sqlite3_fts5_token $w $iStart $iEnd |
| } |
| } |
| |
| do_execsql_test 9.1.1 { |
| CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl); |
| INSERT INTO t1 VALUES('abc'); |
| INSERT INTO t1 VALUES('xyz'); |
| } {} |
| do_test 9.1.2 { set ::flags } {document document} |
| |
| set ::flags [list] |
| do_execsql_test 9.2.1 { SELECT * FROM t1('abc'); } {abc} |
| do_test 9.2.2 { set ::flags } {query} |
| |
| set ::flags [list] |
| do_execsql_test 9.3.1 { SELECT * FROM t1('ab*'); } {abc} |
| do_test 9.3.2 { set ::flags } {prefixquery} |
| |
| set ::flags [list] |
| do_execsql_test 9.4.1 { SELECT * FROM t1('"abc xyz" *'); } {} |
| do_test 9.4.2 { set ::flags } {prefixquery} |
| |
| set ::flags [list] |
| do_execsql_test 9.5.1 { SELECT * FROM t1('"abc xyz*"'); } {} |
| do_test 9.5.2 { set ::flags } {query} |
| |
| #------------------------------------------------------------------------- |
| # |
| reset_db |
| do_execsql_test 10.1 { |
| CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize=unicode61); |
| PRAGMA writable_schema = 1; |
| UPDATE sqlite_schema |
| SET sql = 'CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize="unicode61 error");' |
| WHERE name = 'x1'; |
| } |
| |
| db close |
| sqlite3 db test.db |
| |
| do_catchsql_test 10.2 { |
| SELECT * FROM x1('abc'); |
| } {1 {error in tokenizer constructor}} |
| |
| do_catchsql_test 10.3 { |
| INSERT INTO x1 VALUES('abc'); |
| } {1 {error in tokenizer constructor}} |
| |
| do_execsql_test 10.4 { |
| PRAGMA writable_schema = 1; |
| UPDATE sqlite_schema |
| SET sql = 'CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize="nosuch error");' |
| WHERE name = 'x1'; |
| } |
| |
| db close |
| sqlite3 db test.db |
| |
| do_catchsql_test 10.5 { |
| SELECT * FROM x1('abc'); |
| } {1 {no such tokenizer: nosuch}} |
| do_catchsql_test 10.6 { |
| INSERT INTO x1 VALUES('abc'); |
| } {1 {no such tokenizer: nosuch}} |
| |
| do_execsql_test 10.7 { |
| DROP TABLE x1; |
| SELECT * FROM sqlite_schema; |
| } |
| |
| reset_db |
| do_execsql_test 10.8 { |
| CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize=unicode61); |
| INSERT INTO x1 VALUES('a b c'), ('d e f'), ('a b c'); |
| CREATE VIRTUAL TABLE x1v USING fts5vocab(x1, row); |
| |
| PRAGMA writable_schema = 1; |
| UPDATE sqlite_schema |
| SET sql = 'CREATE VIRTUAL TABLE x1 USING fts5(x, tokenize=simplify);' |
| WHERE name = 'x1'; |
| } |
| |
| do_execsql_test 10.9 { |
| SELECT * FROM x1v |
| } { |
| a 2 2 b 2 2 c 2 2 d 1 1 e 1 1 f 1 1 |
| } |
| |
| db close |
| sqlite3 db test.db |
| |
| do_execsql_test 10.10 { |
| SELECT * FROM x1v |
| } { |
| a 2 2 b 2 2 c 2 2 d 1 1 e 1 1 f 1 1 |
| } |
| |
| finish_test |