FTS5 min_word_size patch

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

FTS5 min_word_size patch

Domingo Alvarez Duarte
Hello !

After previously reporting this issue here, I now have a working
implementation of a "min_word_size" option for the Unicode61Tokenizer; see
the patch below.

With the patch applied, here is the result of a simple test:

====

./sqlite3
SQLite version 3.26.0 2018-09-20 20:43:28
Enter ".help" for usage hints.
Connected to a transient in-memory database.
Use ".open FILENAME" to reopen on a persistent database.
sqlite> create virtual table tfts using fts5(data, tokenize = 'unicode61
min_word_size 3');
sqlite> create virtual table if not exists tfts_vocab_row USING
fts5vocab('tfts', 'row');
sqlite> insert into tfts(data) values('A new way to tokenize using fts5
from sqlite, we can discard n letters word');
sqlite> select * from tfts_vocab_row;
discard|1|1
from|1|1
fts5|1|1
letters|1|1
sqlite|1|1
tokenize|1|1
using|1|1
word|1|1

====

====

fossil diff fts5_tokenize.c
Index: ext/fts5/fts5_tokenize.c
==================================================================
--- ext/fts5/fts5_tokenize.c
+++ ext/fts5/fts5_tokenize.c
@@ -233,10 +233,11 @@
  struct Unicode61Tokenizer {
    unsigned char aTokenChar[128];  /* ASCII range token characters */
    char *aFold;                    /* Buffer to fold text into */
    int nFold;                      /* Size of aFold[] in bytes */
    int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
+  int nMinWordSize;           /* Min size of a word to be indexed */
    int nException;
    int *aiException;

    unsigned char aCategory[32];    /* True for token char categories */
  };
@@ -360,10 +361,11 @@
        const char *zCat = "L* N* Co";
        int i;
        memset(p, 0, sizeof(Unicode61Tokenizer));

        p->bRemoveDiacritic = 1;
+      p->nMinWordSize = 0;
        p->nFold = 64;
        p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
        if( p->aFold==0 ){
          rc = SQLITE_NOMEM;
        }
@@ -393,10 +395,14 @@
          if( 0==sqlite3_stricmp(azArg[i], "separators") ){
            rc = fts5UnicodeAddExceptions(p, zArg, 0);
          }else
          if( 0==sqlite3_stricmp(azArg[i], "categories") ){
            /* no-op */
+        }else
+        if( 0==sqlite3_stricmp(azArg[i], "min_word_size") ){
+          int mwsz;
+          if( sqlite3GetInt32(zArg, &mwsz) ) p->nMinWordSize = mwsz;
          }else{
            rc = SQLITE_ERROR;
          }
        }

@@ -450,10 +456,11 @@
    while( rc==SQLITE_OK ){
      int iCode;                    /* non-ASCII codepoint read from input */
      char *zOut = aFold;
      int is;
      int ie;
+    int wsz;

      /* Skip any separator characters. */
      while( 1 ){
        if( zCsr>=zTerm ) goto tokenize_done;
        if( *zCsr & 0x80 ) {
@@ -517,12 +524,15 @@
          zCsr++;
        }
        ie = zCsr - (unsigned char*)pText;
      }

+    wsz = zOut-aFold;
+    /* Check min word size */
+    if(p->nMinWordSize && p->nMinWordSize >= wsz) continue;
      /* Invoke the token callback */
-    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
+    rc = xToken(pCtx, 0, aFold, wsz, is, ie);
    }

   tokenize_done:
    if( rc==SQLITE_DONE ) rc = SQLITE_OK;
    return rc;

====

_______________________________________________
sqlite-users mailing list
[hidden email]
http://mailinglists.sqlite.org/cgi-bin/mailman/listinfo/sqlite-users