Full Text Search Example/Tour
You can use FTS5 through normal SQL as documented by SQLite. This example shows APSW specific functionality and extras.
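For contrast, here is a minimal, self-contained sketch of the plain SQL approach with no APSW specific helpers. The documents table and its contents are made up for illustration; the rest of this page uses the APSW helpers instead.

import apsw

con = apsw.Connection(":memory:")

# a contentless-external FTS5 table created and queried with plain SQL
con.execute("CREATE VIRTUAL TABLE documents USING fts5(title, body)")
con.execute(
    "INSERT INTO documents VALUES(?, ?)",
    ("Hello", "a full text search example"),
)

# MATCH runs the FTS query, and ordering by rank puts best matches first
for row in con.execute(
    "SELECT title FROM documents WHERE documents MATCH ? ORDER BY rank",
    ("search",),
):
    print(row)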
#!/usr/bin/env python3

# This code uses Python's optional typing annotations.  You can
# ignore them and do not need to use them.  If you do use them
# then you must include this future annotations line first.
from __future__ import annotations

from pprint import pprint

import re
import functools

import apsw
import apsw.ext

# The three FTS5 specific modules
import apsw.fts5
import apsw.fts5aux
import apsw.fts5query
Is FTS5 available?
FTS5 is included as part of the SQLite library (usually).
print("FTS5 available:", "ENABLE_FTS5" in apsw.compile_options)
FTS5 available: True
Content setup
The connection to use. The database has a table with recipes.
connection = apsw.Connection("recipes.db")

# The content
print(
    connection.execute(
        "SELECT sql FROM sqlite_schema WHERE name='recipes'"
    ).get
)
print(connection.execute("SELECT COUNT(*) FROM recipes").get)
CREATE TABLE recipes(name, ingredients, description)
173278
Create/open search table
Create a table containing the search index using recipes as an external content table.
if not connection.table_exists("main", "search"):
    # create does all the hard work
    search_table = apsw.fts5.Table.create(
        connection,
        # The table will be named 'search'
        "search",
        # External content table name.  It has to be in the same
        # database.
        content="recipes",
        # We want the same columns as recipes, so pass `None`
        columns=None,
        # Triggers ensure that changes to the content table
        # are reflected in the search table
        generate_triggers=True,
        # Use APSW recommended tokenization
        tokenize=[
            # simplify tokenizer
            "simplify",
            # removes case distinction
            "casefold",
            "true",
            # removes accents, uses compatibility codepoints
            "strip",
            "true",
            # unicode algorithm for word boundaries tokenizer
            "unicodewords",
        ],
        # There are many more options you can set
    )
else:
    # Already exists so just the name is needed
    search_table = apsw.fts5.Table(connection, "search")

# You should use this to get the table name when formatting SQL
# queries as they can't use bindings.  It will correctly quote the
# schema (attached database name) and the table name no matter what
# characters, spaces etc are used.
print("quoted name", search_table.quoted_table_name)
quoted name "main"."search"
Table structure and statistics
Examine the structure, options, and statistics
pprint(search_table.structure)

# rank is one of several options you can read or change
print(f"{search_table.config_rank()=}")

# some statistics
print(f"{search_table.row_count=}")
print(f"{search_table.tokens_per_column=}")
FTS5TableStructure(name='search',
                   columns=('name', 'ingredients', 'description'),
                   unindexed=set(),
                   tokenize=('simplify',
                             'casefold',
                             'true',
                             'strip',
                             'true',
                             'unicodewords'),
                   prefix=set(),
                   content='recipes',
                   content_rowid='_ROWID_',
                   contentless_delete=None,
                   contentless_unindexed=None,
                   columnsize=True,
                   tokendata=False,
                   locale=False,
                   detail='full')
search_table.config_rank()='bm25()'
search_table.row_count=173278
search_table.tokens_per_column=[743406, 8064731, 3824644]
Content
Use upsert() to add or change existing
content and delete() to delete a row. They
understand external content tables and will do the operations there,
then the triggers will update the search index.
row_by_id() gets one or more columns from a
row, and also handles external content tables.
# upsert returns the rowid of an insert or change
rowid = search_table.upsert(
    # you can use positional parameters so this goes to the first column
    "This ends up in the name field",
    # and keywords
    description="This ends up in the description",
)
print(f"{search_table.row_by_id(rowid, 'name')=}")

# modify that row
search_table.upsert(ingredients="some stuff", rowid=rowid)

# And delete our test row
search_table.delete(rowid)
search_table.row_by_id(rowid, 'name')='This ends up in the name field'
Searching with SQL
You need to specify what should be returned, the FTS query, and an ORDER BY to get the best matches first.
sql = """
    SELECT
    -- snippet takes a lot of options!
    snippet(search, -1, '►', '◄', '...', 10) as Snippet
    -- Pass the query like this
    FROM search(?)
    -- Best matches first
    ORDER BY rank
    -- Top 3
    LIMIT 3"""

for query in (
    "cream",
    "orange NOT zest",
    "name:honey",
    "pomegranate OR olives",
):
    print(f"{query=}")
    print(apsw.ext.format_query_table(connection, sql, (query,)))
query='cream'
┌────────────────────────────────────────────────────────────┐
│                           Snippet                          │
├────────────────────────────────────────────────────────────┤
│ Find the recipe for Sour ►Cream◄ Ice ►Cream◄ and other...  │
├────────────────────────────────────────────────────────────┤
│ 1 (8 ounce) package ►cream◄ cheese                         │
│ 1 cup white sugar...                                       │
├────────────────────────────────────────────────────────────┤
│ Want a Ice ►Cream◄ and Party?                              │
│ Here's a Ice-►Cream◄...                                    │
└────────────────────────────────────────────────────────────┘

query='orange NOT zest'
┌────────────────────────────────────────────────────┐
│                       Snippet                      │
├────────────────────────────────────────────────────┤
│ ice                                                │
│ 750ml blood ►orange◄ , or ►orange◄ juice           │
│ 750ml soda water...                                │
├────────────────────────────────────────────────────┤
│ 1 tablespoon olive oil                             │
│ 4 (4 ounce) fillets ►orange◄ roughy...             │
├────────────────────────────────────────────────────┤
│ ...►orange◄-flavored liqueur                       │
│ Extra ►orange◄ slices for serving, or ►orange◄...  │
└────────────────────────────────────────────────────┘

query='name:honey'
┌─────────────────────────────────────────────────────────────────────────┐
│                                 Snippet                                 │
│ Wildflower-►Honey◄ Semifreddo with ►Honey◄ Sesame Wafers                │
│ Roast Chicken with Saffron, Hazelnuts, and ►Honey◄                      │
│ Recipe Flash: Ginger-►Honey◄ Chicken WingsGinger-►Honey◄ Chicken Wings  │
└─────────────────────────────────────────────────────────────────────────┘

query='pomegranate OR olives'
┌──────────────────────────────────────────────────────────────────────────────┐
│                                    Snippet                                   │
├──────────────────────────────────────────────────────────────────────────────┤
│ A new way to serve ►olives◄ - marinated in ►pomegranate◄ juice...            │
├──────────────────────────────────────────────────────────────────────────────┤
│ Purple kale, oranges, feta cheese and ►olives◄ topped with ►pomegranate◄...  │
├──────────────────────────────────────────────────────────────────────────────┤
│ ...juice                                                                     │
│ Seeds from a ►pomegranate◄                                                   │
│ 25g dried apricot pieces                                                     │
│ Quarter...                                                                   │
└──────────────────────────────────────────────────────────────────────────────┘
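snippet() is only one of the built-in FTS5 auxiliary functions. As a sketch (not part of the original example), the same query can also use the standard highlight() and bm25() functions; the column choice and rounding below are illustrative.

# A sketch using other built-in FTS5 auxiliary functions: highlight()
# marks matches in a whole column (here column 0, the name), and
# bm25() is the score that "ORDER BY rank" uses (more negative is better)
sql = """
    SELECT
       highlight(search, 0, '►', '◄') AS Name,
       round(bm25(search), 2) AS Score
    FROM search(?)
    ORDER BY rank
    LIMIT 3"""

print(apsw.ext.format_query_table(connection, sql, ("cream",)))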
Search method
search() provides a Pythonic API, giving
information about each matching row, best matches first.
for row in search_table.search("lemon OR guava"):
    # Note how you see overall query info (it is two phrases) and
    # information about the matched row (how many tokens in each
    # column), and which columns each phrase was found in
    pprint(row)
    # only show the first matching row
    break

# Inspect first matching row
name, description = search_table.row_by_id(
    row.rowid, ("name", "description")
)
print((name, description))
MatchInfo(query_info=QueryInfo(phrases=(('lemon',), ('guava',))),
          rowid=1642796043941632884,
          column_size=[3, 24, 5],
          phrase_columns=[[1], [0, 2]])
("P.L's Guava Jelly", 'Delicious home-made guava jam')
Query correction and suggestion
If the query contains words that don't exist or are very rare (likely a spelling difference), then you can get an alternate query that will probably improve the results.
for query in (
    "jalapno",
    # query structure is maintained
    "orange AND guice",
    # column names are corrected too
    "nyme:(minced OR oyl NOT peenut)",
    # None is returned if all terms are ok
    "sweet onion",
):
    suggest = search_table.query_suggest(query)
    print(f"{query=} {suggest=}")
query='jalapno' suggest='jalapeño'
query='orange AND guice' suggest='orange juice'
query='nyme:(minced OR oyl NOT peenut)' suggest='name: (minced OR oil NOT peanut)'
query='sweet onion' suggest=None
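A sketch (not from the original example) of how an application might use the suggestion: run the query as typed, and fall back to the corrected query only when nothing was found. The helper name and the limit are made up for illustration.

import itertools


def search_with_fallback(table: apsw.fts5.Table, query: str, limit: int = 5):
    # grab up to `limit` matches for the query as typed
    matches = list(itertools.islice(table.search(query), limit))
    if matches:
        return query, matches
    # nothing found - see if a corrected query does better
    suggestion = table.query_suggest(query)
    if suggestion is not None:
        return suggestion, list(itertools.islice(table.search(suggestion), limit))
    return query, matches


used, matches = search_with_fallback(search_table, "jalapno chese")
print("query used:", used, "matches found:", len(matches))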
Working with tokens
Document and query text is processed into tokens, with matches found based on those tokens. Tokens are not visible to the user.
Typically they correspond to "words" in the text, but with upper/lower case neutralized, and punctuation, marks, and accents removed.
# See what happens with sample text
text = "Don't 2.245e5 Run-Down Déjà 你好世界😂❤️🤣"

# tokenization happens on UTF8
utf8 = text.encode()

# Note offsets into the utf8.  Those could be used to highlight the
# original.
pprint(search_table.tokenize(utf8))

# For our table, the most popular tokens, and what percent of rows
# they are in
print("\nMost popular by row count")
row_count = search_table.row_count
for token, count in search_table.token_doc_frequency():
    print(f"{token:20}{count/row_count:.0%}")

# Most popular tokens, based on total token count
print("\nMost popular by token count")
token_count = search_table.token_count
for token, count in search_table.token_frequency():
    print(f"{token:20}{count/token_count:.0%}")

# Find what text produced a token, by looking at 5 rows.
token = "jalapeno"
text = search_table.text_for_token(token, 5)
print(f"\nText for {token=} is {text}")
[(0, 5, 'dont'),
 (6, 13, '2245e5'),
 (14, 17, 'run'),
 (18, 22, 'down'),
 (23, 29, 'deja'),
 (30, 33, '你'),
 (33, 36, '好'),
 (36, 39, '世'),
 (39, 42, '界'),
 (42, 46, '😂'),
 (46, 52, '❤'),
 (52, 56, '🤣')]

Most popular by row count
1                   92%
2                   80%
and                 72%
a                   59%
cup                 56%
cups                53%
4                   49%
with                47%
3                   47%
the                 43%

Most popular by token count
1                   8%
2                   4%
cups                2%
cup                 2%
and                 2%
1⁄2                 2%
4                   1%
teaspoon            1%
a                   1%
3                   1%

Text for token='jalapeno' is Jalapeño
Key Tokens and More Like
key_tokens() finds tokens represented in a
row, but rare in other rows. This is purely statistical and has no
idea of the meaning or relationship between tokens.
more_like() is given some rowids, extracts
their key tokens, and starts a search with them, excluding the
rowids already seen. This lets you provide "infinite scrolling"
starting from one or more rows, providing additional similar
content.
Both methods let you specify particular columns, or all columns (the default).
# A randomly selected row ...
bbq_rowid = 1642796066805404445
# ... for bbq sauce
print(search_table.row_by_id(bbq_rowid, "name"))

# Note how each token gets a score, with bigger numbers meaning the
# token is more unique
pprint(search_table.key_tokens(bbq_rowid, columns=["name"], limit=3))

# More like based on the ingredients column
for count, match_info in enumerate(
    search_table.more_like([bbq_rowid], columns="ingredients")
):
    # Show the name for each
    print(search_table.row_by_id(match_info.rowid, "name"))
    # We could save each of these rowids and at the end do another
    # more_like including them.  Stop after a few for now.
    if count == 5:
        break
Thick and Sticky Barbecue Sauce Recipe
[(0.0002008032128514056, 'sticky'),
 (0.00015475085112968121, 'barbecue'),
 (3.077870113881194e-05, 'thick')]
Indian Chicken Rub
Berbere
Sarah's Chili Seasoning Mix
Jenga Ribs
Indian Spice Baked Chicken
Valentine's Day Steak Rub
Autocomplete
You often want to show results after just a few letters have been
typed, before a complete word has been entered. This is done by
indexing sequences of a few letters, called ngrams. Ngrams are never
shown to the user, although you can see them in the snippets below.
if not connection.table_exists("main", "autocomplete"):
    # create does all the hard work
    autocomplete = apsw.fts5.Table.create(
        connection,
        # The table will be named 'autocomplete'
        "autocomplete",
        # External content table name.  It has to be in the same
        # database.
        content="recipes",
        # We want the same columns as recipes, so pass `None`
        columns=None,
        # Triggers ensure that changes to the content table
        # are reflected in the search table
        generate_triggers=True,
        # Use APSW recommended tokenization
        tokenize=[
            # simplify tokenizer
            "simplify",
            # removes case distinction
            "casefold",
            "true",
            # removes accents, uses compatibility codepoints
            "strip",
            "true",
            # ngram tokenizer
            "ngram",
            # How big is each sequence?  This is how many letters have
            # to be typed before any match is possible.  Smaller values
            # result in larger indexes.
            "ngrams",
            "3",
        ],
        # There are many more options you can set
    )
else:
    # Already exists so just the name is needed
    autocomplete = apsw.fts5.Table(connection, "autocomplete")

# do some queries against the autocomplete index
sql = """
    SELECT
    -- snippet takes a lot of options!
    snippet(autocomplete, -1, '►', '◄', '...', 10) as Snippet
    -- Pass the query like this
    FROM autocomplete(?)
    -- Best matches first
    ORDER BY rank
    -- Top 3
    LIMIT 3"""

for query in (
    "eam",
    "ora",
    "name:ney",
    "emo jui",
    "barbecue",
):
    print(f"{query=}")
    print(apsw.ext.format_query_table(connection, sql, (query,)))
query='eam'
┌───────────────────┐
│      Snippet      │
│ Sour Cr►eam◄ I... │
│ Cr►eam◄less Cr... │
│ Cr►eam◄ cheese... │
└───────────────────┘

query='ora'
┌───────────────────┐
│      Snippet      │
│ Blood ►ora◄nge... │
│ ►Ora◄nge cake ... │
│ ►Ora◄nge Deser... │
└───────────────────┘

query='name:ney'
┌──────────────────────┐
│        Snippet       │
│ Ho►ney◄ed-Mang...    │
│ ...r-Ho►ney◄ Semi... │
│ ...h Ho►ney◄ed-Ma... │
└──────────────────────┘

query='emo jui'
┌────────────────────────┐
│         Snippet        │
│ ... L►emo◄n ►Jui◄ce... │
│ ... l►emo◄n ►jui◄ce... │
│ ... l►emo◄n ►jui◄ce... │
└────────────────────────┘

query='barbecue'
┌───────────────────┐
│      Snippet      │
│ Classic ►Barb◄... │
│ ►Barbecue◄ Por... │
│ ►Barbecue◄d Ba... │
└───────────────────┘
Auxiliary functions
Auxiliary functions are called for each matching row. They can be used to provide ranking (sorting) to determine better matches, or to return information about the match and row, such as highlights and snippets.
They are called with FTS5ExtensionApi as the first
parameter, and then any function arguments, and return a value.
This example shows all the information available during a query.
def row_match(
    api: apsw.FTS5ExtensionApi, *args: apsw.SQLiteValue
) -> apsw.SQLiteValue:
    print("row_match called with", args)
    # Show what information is available from the api
    print(f"{api.rowid=}")
    print(f"{api.row_count=}")
    print(f"{api.column_count=}")
    for col in range(api.column_count):
        print(f"  {col=} {api.column_size(col)=}")
        print(f"  {col=} {api.column_total_size(col)=}")
    print(f"{api.inst_count=}")
    for inst in range(api.inst_count):
        print(f"  {inst=} {api.inst_tokens(inst)=}")
    print(f"{api.phrases=}")
    for phrase in range(len(api.phrases)):
        # which columns the phrase is found in
        print(f"  {phrase=} {api.phrase_columns(phrase)=}")
        # which token numbers
        print(f"  {phrase=} {api.phrase_locations(phrase)=}")
    # the offsets of phrase 3 in column 2
    print(f"{api.phrase_column_offsets(3, 2)=}")

    # note the text is the utf-8 encoded bytes
    print(f"{api.column_text(0)=}")

    # we can get a tokenization of text, useful if you want to extract
    # the original text, add snippets/highlights etc
    print("Tokenized with UTF-8 offsets")
    pprint(api.tokenize(api.column_text(2), api.column_locale(2)))

    # query_phrase is useful for finding out how common a phrase is.
    counts = [0] * len(api.phrases)
    for phrase in range(len(api.phrases)):
        api.query_phrase(phrase, phrase_count, (phrase, counts))

    for i, phrase in enumerate(api.phrases):
        print(f"Phrase {phrase=} occurs {counts[i]:,} times")

    return 7


# This is used as the callback from query_phrase above.  Note that the
# api instance in this call is different than the above function.
def phrase_count(api: apsw.FTS5ExtensionApi, closure):
    phrase, counts = closure

    # increment counts for this phrase
    counts[phrase] += 1
    if counts[phrase] < 5:
        # Show call info the first 4 times for each phrase
        print(f"phrase_count called {api.rowid=} {api.phrases=}")

    # we could do more sophisticated work such as counting how many
    # times it occurs (api.phrase_locations) or which columns
    # (api.phrase_columns).


connection.register_fts5_function("row_match", row_match)

# A deliberately complex query to make the api interesting
query = (
    """("BoiLed eGGs" OR CinnaMON) OR NEAR (drink Cup, 5) NOT Oran*"""
)

# Make all the code above be called.  Note how the table name has to be
# the first parameter to our function in the SQL
connection.execute(
    "SELECT row_match(search, 5, 'hello') FROM search(?) order by rank",
    (query,),
)
row_match called with (5, 'hello')
api.rowid=1642795772696450313
api.row_count=173279
api.column_count=3
  col=0 api.column_size(col)=3
  col=0 api.column_total_size(col)=743413
  col=1 api.column_size(col)=4
  col=1 api.column_total_size(col)=8064731
  col=2 api.column_size(col)=9
  col=2 api.column_total_size(col)=3824650
api.inst_count=2
  inst=0 api.inst_tokens(inst)=('boiled', 'eggs')
  inst=1 api.inst_tokens(inst)=('boiled', 'eggs')
api.phrases=(('boiled', 'eggs'), ('cinnamon',), ('drink',), ('cup',), ('oran',))
  phrase=0 api.phrase_columns(phrase)=(0, 2)
  phrase=0 api.phrase_locations(phrase)=[[1], [], [7]]
  phrase=1 api.phrase_columns(phrase)=()
  phrase=1 api.phrase_locations(phrase)=[[], [], []]
  phrase=2 api.phrase_columns(phrase)=()
  phrase=2 api.phrase_locations(phrase)=[[], [], []]
  phrase=3 api.phrase_columns(phrase)=()
  phrase=3 api.phrase_locations(phrase)=[[], [], []]
  phrase=4 api.phrase_columns(phrase)=()
  phrase=4 api.phrase_locations(phrase)=[[], [], []]
api.phrase_column_offsets(3, 2)=[]
api.column_text(0)=b'Hard Boiled Eggs'
Tokenized with UTF-8 offsets
[(0, 3, 'the'),
 (4, 9, 'right'),
 (10, 13, 'way'),
 (14, 16, 'to'),
 (17, 21, 'cook'),
 (22, 29, 'perfect'),
 (30, 34, 'hard'),
 (35, 41, 'boiled'),
 (42, 46, 'eggs')]
phrase_count called api.rowid=1642795733280177125 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795734218860660 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795742364372196 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795748917697308 api.phrases=(('boiled', 'eggs'),)
phrase_count called api.rowid=1642795722012755793 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755797 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755806 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755828 api.phrases=(('cinnamon',),)
phrase_count called api.rowid=1642795722012755795 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795722012819365 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795725826572496 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795725826588872 api.phrases=(('drink',),)
phrase_count called api.rowid=1642795721317011550 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795721317011555 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795721369682815 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795721369683223 api.phrases=(('cup',),)
phrase_count called api.rowid=1642795722012755799 api.phrases=(('oran',),)
phrase_count called api.rowid=1642795722012755999 api.phrases=(('oran',),)
phrase_count called api.rowid=1642795722012757203 api.phrases=(('oran',),)
phrase_count called api.rowid=1642795722012819341 api.phrases=(('oran',),)
Phrase phrase=('boiled', 'eggs') occurs 321 times
Phrase phrase=('cinnamon',) occurs 12,000 times
Phrase phrase=('drink',) occurs 1,925 times
Phrase phrase=('cup',) occurs 96,655 times
Phrase phrase=('oran',) occurs 8,181 times
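The function above returns a constant and just prints information; in practice auxiliary functions are most often used for ranking. Here is a sketch (not part of the original example) of a simple custom rank that favours hits in the name column. The function name and weights are made up, and the built-in bm25() is normally a better choice.

def simple_rank(api: apsw.FTS5ExtensionApi) -> float:
    # Count phrase hits, weighting the name column (0) more heavily
    # than ingredients (1) and description (2).  FTS5 sorts rank
    # ascending, so return a negative number - more hits, better rank.
    weights = (5.0, 1.0, 2.0)
    score = 0.0
    for phrase in range(len(api.phrases)):
        for column, locations in enumerate(api.phrase_locations(phrase)):
            score += weights[column] * len(locations)
    return -score


connection.register_fts5_function("simple_rank", simple_rank)

# Use it to order results instead of the default bm25()
print(
    apsw.ext.format_query_table(
        connection,
        """SELECT snippet(search, 0, '►', '◄', '...', 8) AS Name
           FROM search(?)
           ORDER BY simple_rank(search)
           LIMIT 3""",
        ("cinnamon",),
    )
)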
Query parsing and manipulation
apsw.fts5query lets you programmatically create, update, and
parse queries. There are three forms of query.
# This is the query as accepted by FTS5.
print("query")
print(query)

# That can be parsed into the structure
parsed = apsw.fts5query.parse_query_string(query)
print("\nparsed")
pprint(parsed)

# The parsed form is a little unwieldy to work with so a dict based
# form is available.
as_dict = apsw.fts5query.to_dict(parsed)
print("\nas_dict")
pprint(as_dict)

# Make some changes - delete the first query
del as_dict["queries"][0]

as_dict["queries"].append(
    {
        # add a columnfilter
        "@": "COLUMNFILTER",
        "filter": "include",
        "columns": ["name", "description"],
        # The sub queries are just strings.  The module knows what
        # you mean and will convert them into AND
        "query": ["some thing blue", "sunshine"],
    }
)
print("\nmodified as_dict")
pprint(as_dict)

# Turn it into parsed form
parsed = apsw.fts5query.from_dict(as_dict)
print("\nnew parsed")
pprint(parsed)

# Turn the parsed form back into a query string
query = apsw.fts5query.to_query_string(parsed)
print("\nnew query")
print(query)
query
("BoiLed eGGs" OR CinnaMON) OR NEAR (drink Cup, 5) NOT Oran*

parsed
OR(queries=[OR(queries=[PHRASE(phrase='BoiLed eGGs', initial=False, prefix=False, plus=None),
                        PHRASE(phrase='CinnaMON', initial=False, prefix=False, plus=None)]),
            NOT(match=NEAR(phrases=[PHRASE(phrase='drink', initial=False, prefix=False, plus=None),
                                    PHRASE(phrase='Cup', initial=False, prefix=False, plus=None)],
                           distance=5),
                no_match=PHRASE(phrase='Oran', initial=False, prefix=True, plus=None))])

as_dict
{'@': 'OR',
 'queries': [{'@': 'OR',
              'queries': [{'@': 'PHRASE', 'phrase': 'BoiLed eGGs'},
                          {'@': 'PHRASE', 'phrase': 'CinnaMON'}]},
             {'@': 'NOT',
              'match': {'@': 'NEAR',
                        'distance': 5,
                        'phrases': [{'@': 'PHRASE', 'phrase': 'drink'},
                                    {'@': 'PHRASE', 'phrase': 'Cup'}]},
              'no_match': {'@': 'PHRASE', 'phrase': 'Oran', 'prefix': True}}]}

modified as_dict
{'@': 'OR',
 'queries': [{'@': 'NOT',
              'match': {'@': 'NEAR',
                        'distance': 5,
                        'phrases': [{'@': 'PHRASE', 'phrase': 'drink'},
                                    {'@': 'PHRASE', 'phrase': 'Cup'}]},
              'no_match': {'@': 'PHRASE', 'phrase': 'Oran', 'prefix': True}},
             {'@': 'COLUMNFILTER',
              'columns': ['name', 'description'],
              'filter': 'include',
              'query': ['some thing blue', 'sunshine']}]}

new parsed
OR(queries=[NOT(match=NEAR(phrases=[PHRASE(phrase='drink', initial=False, prefix=False, plus=None),
                                    PHRASE(phrase='Cup', initial=False, prefix=False, plus=None)],
                           distance=5),
                no_match=PHRASE(phrase='Oran', initial=False, prefix=True, plus=None)),
            COLUMNFILTER(columns=['name', 'description'],
                         filter='include',
                         query=AND(queries=[PHRASE(phrase='some thing blue', initial=False, prefix=False, plus=None),
                                            PHRASE(phrase='sunshine', initial=False, prefix=False, plus=None)]))])

new query
NEAR(drink Cup, 5) NOT Oran* OR {name description}: ("some thing blue" sunshine)
Tokenizers
Tokenizers convert
text into the tokens used to find matching rows. They work on UTF8
input, providing the beginning and end offsets for each token. They
can also provide more than one token at the same position, for
example if you wanted both first and 1st.
Tokenizers and their arguments are specified as the 'tokenize'
option when creating a FTS5 table. You can also call them directly
from a connection. APSW provides several tokenizers, but let's look
at unicode61, the default SQLite tokenizer.
tokenizer = connection.fts5_tokenizer("unicode61")

test_text = """🤦🏼‍♂️ v1.2 Grey III ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR"""

# Call the tokenizer to do a tokenization, supplying the reason
# and no locale.
pprint(
    tokenizer(
        test_text.encode("utf8"), apsw.FTS5_TOKENIZE_DOCUMENT, None
    )
)


# Make a function to show output
def show_tokens(
    text,
    tokenizer_name,
    tokenizer_args=None,
    reason=apsw.FTS5_TOKENIZE_DOCUMENT,
):
    print(f"\n{text=:s}")
    print(f"{tokenizer_name=} {tokenizer_args=}")

    tokenizer = connection.fts5_tokenizer(
        tokenizer_name, tokenizer_args
    )
    # exclude the offsets since they clutter the output
    pprint(
        tokenizer(
            text.encode("utf8"),
            reason,
            None,
            include_offsets=False,
        )
    )
    print()


show_tokens("v1.2 SQLITE_ERROR", "unicode61")

# We want the version number and symbol kept together, so use
# the tokenchars parameter.  Many tokenizers take parameters.
show_tokens("v1.2 SQLITE_ERROR", "unicode61", ["tokenchars", "_."])

# Tokenizers can also be chained together.  The porter tokenizer takes
# existing tokens and turns them into a base.  The rightmost tokenizer
# generates tokens, while ones to the left transform them.  This ensures
# you can search for variations of words without having to get them
# exactly right.
show_tokens(
    "Likes liked likely liking cat cats colour color",
    "porter",
    ["unicode61", "tokenchars", "_"],
)
[(0, 8, '🤦🏼'),
 (18, 20, 'v1'),
 (21, 22, '2'),
 (23, 27, 'grey'),
 (28, 31, 'iii'),
 (32, 38, 'colour'),
 (40, 43, 'don'),
 (44, 45, 't'),
 (46, 50, 'jump'),
 (62, 74, '你好世界'),
 (75, 82, 'straße'),
 (87, 90, 'ह'),
 (93, 96, 'ल'),
 (100, 106, 'वर'),
 (109, 112, 'ल'),
 (115, 118, 'ड'),
 (119, 125, 'deja'),
 (126, 128, 'vu'),
 (129, 137, 'resume'),
 (138, 144, 'sqlite'),
 (145, 150, 'error')]

text=v1.2 SQLITE_ERROR
tokenizer_name='unicode61' tokenizer_args=None
[('v1',), ('2',), ('sqlite',), ('error',)]


text=v1.2 SQLITE_ERROR
tokenizer_name='unicode61' tokenizer_args=['tokenchars', '_.']
[('v1.2',), ('sqlite_error',)]


text=Likes liked likely liking cat cats colour color
tokenizer_name='porter' tokenizer_args=['unicode61', 'tokenchars', '_']
[('like',), ('like',), ('like',), ('like',), ('cat',), ('cat',), ('colour',), ('color',)]
apsw.fts5.UnicodeWordsTokenizer
apsw.fts5.UnicodeWordsTokenizer() does word segmentation using
the Unicode TR29 algorithm, which
works well across languages. It understands when punctuation is part
of a word (as in don't), that numbers include punctuation, as well
as emoji and regional indicators.
connection.register_fts5_tokenizer(
    "unicodewords", apsw.fts5.UnicodeWordsTokenizer
)

# unicode61 doesn't understand grapheme clusters or
# punctuation in words, or other languages
show_tokens(test_text, "unicode61")

# unicodewords has you covered
show_tokens(test_text, "unicodewords")
text=🤦🏼‍♂️ v1.2 Grey III ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='unicode61' tokenizer_args=None
[('🤦🏼',), ('v1',), ('2',), ('grey',), ('iii',), ('colour',), ('don',), ('t',), ('jump',), ('你好世界',), ('straße',), ('ह',), ('ल',), ('वर',), ('ल',), ('ड',), ('deja',), ('vu',), ('resume',), ('sqlite',), ('error',)]


text=🤦🏼‍♂️ v1.2 Grey III ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='unicodewords' tokenizer_args=None
[('🤦🏼\u200d♂️',), ('v1.2',), ('Grey',), ('III',), ('ColOUR',), ("Don't",), ('jump',), ('🇫🇮',), ('你',), ('好',), ('世',), ('界',), ('Straße',), ('हैलो',), ('वर्ल्ड',), ('Déjà',), ('vu',), ('Résumé',), ('SQLITE_ERROR',)]
apsw.fts5.SimplifyTokenizer
You may have noticed that there are accents (diacritics) and
mixed case in the tokens in the example above. It is
convenient to remove those. The apsw.fts5.SimplifyTokenizer()
can neutralize case and remove accents and marks, so you can use it
to filter your own or other tokenizers.
connection.register_fts5_tokenizer(
    "simplify", apsw.fts5.SimplifyTokenizer
)

show_tokens(
    test_text,
    "simplify",
    [
        # casefold is for case insensitive comparisons
        "casefold",
        "1",
        # strip decomposes codepoints to remove accents
        # and marks, and uses compatibility codepoints,
        # an example is Roman numeral Ⅲ becomes III
        "strip",
        "1",
        # Use unicodewords to get the tokens to simplify
        "unicodewords",
    ],
)
text=🤦🏼‍♂️ v1.2 Grey III ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='simplify' tokenizer_args=['casefold', '1', 'strip', '1', 'unicodewords']
[('🤦♂',), ('v12',), ('grey',), ('iii',), ('colour',), ('dont',), ('jump',), ('🇫🇮',), ('你',), ('好',), ('世',), ('界',), ('strasse',), ('हल',), ('वरलड',), ('deja',), ('vu',), ('resume',), ('sqliteerror',)]
Your own tokenizer
We will define our own tokenizer to be the same as above, but without all those parameters in the table definition. A tokenizer takes the connection and a list of string parameters.
def my_tokenizer(
    con: apsw.Connection, params: list[str]
) -> apsw.FTS5Tokenizer:
    # we take no params
    if params:
        raise ValueError("Expected no parameters")

    # Same as above, but no longer in our SQL
    return con.fts5_tokenizer(
        "simplify",
        ["casefold", "1", "strip", "1", "unicodewords"],
    )


connection.register_fts5_tokenizer("mine", my_tokenizer)

# Produces same result as above
show_tokens(test_text, "mine")
text=🤦🏼‍♂️ v1.2 Grey III ColOUR! Don't jump -  🇫🇮你好世界 Straße
    हैलो वर्ल्ड Déjà vu Résumé SQLITE_ERROR
tokenizer_name='mine' tokenizer_args=None
[('🤦♂',), ('v12',), ('grey',), ('iii',), ('colour',), ('dont',), ('jump',), ('🇫🇮',), ('你',), ('好',), ('世',), ('界',), ('strasse',), ('हल',), ('वरलड',), ('deja',), ('vu',), ('resume',), ('sqliteerror',)]
Your own tokenizer, part 2
We’ll make one entirely our own, not building on any existing
tokenizer. Tokenizers operate on UTF8 and byte offsets. The
apsw.fts5.StringTokenizer() decorator lets you operate on
str instead and handles the mapping.
apsw.fts5.parse_tokenizer_args() makes it easy to handle
parameters.
@apsw.fts5.StringTokenizer
def atokenizer(
    con: apsw.Connection, params: list[str]
) -> apsw.FTS5Tokenizer:
    # What we accept
    ta = apsw.fts5.TokenizerArgument
    spec = {
        # two choices
        "big": ta(choices=("ship", "plane")),
        # default value only
        "small": "hello",
        # conversion
        "block": ta(default=2, convertor=int),
    }

    options = apsw.fts5.parse_tokenizer_args(spec, con, params)

    # show what options we got
    print(f"{options=}")

    def tokenize(text: str, reason: int, locale: str | None):
        # See apsw.fts5.tokenize_reasons for mapping from text to number
        print(f"{reason=}")
        # if a locale table and value was used
        print(f"{locale=}")
        # break string in groups of 'block' characters
        for start in range(0, len(text), options["block"]):
            token = text[start : start + options["block"]]
            yield start, start + len(token), token

    return tokenize


connection.register_fts5_tokenizer("atokenizer", atokenizer)

# show full return - note offsets are utf8 bytes
tok = connection.fts5_tokenizer(
    "atokenizer", ["big", "plane", "block", "5"]
)
pprint(tok(test_text.encode("utf8"), apsw.FTS5_TOKENIZE_AUX, None))
options={'big': 'plane', 'block': 5, 'small': 'hello'}
reason=8
locale=None
[(0, 17, '🤦🏼\u200d♂️'),
 (17, 22, ' v1.2'),
 (22, 27, ' Grey'),
 (27, 34, ' III Co'),
 (34, 39, 'lOUR!'),
 (39, 44, " Don'"),
 (44, 49, 't jum'),
 (49, 54, 'p -  '),
 (54, 71, '🇫🇮你好世'),
 (71, 78, '界 Str'),
 (78, 84, 'aße\n '),
 (84, 93, '   है'),
 (93, 106, 'लो वर'),
 (106, 119, '्ल्ड '),
 (119, 126, 'Déjà '),
 (126, 132, 'vu Ré'),
 (132, 138, 'sumé '),
 (138, 143, 'SQLIT'),
 (143, 148, 'E_ERR'),
 (148, 150, 'OR')]
apsw.fts5.RegexTokenizer
We can use regular expressions. Unlike the other
tokenizers the pattern is not passed as a SQL level parameter
because there would be a confusing amount of backslashes, square
brackets and other quoting going on.
pattern = r"\d+"  # digits
flags = re.ASCII  # only ascii recognised

tokenizer = functools.partial(
    apsw.fts5.RegexTokenizer, pattern=pattern, flags=flags
)
connection.register_fts5_tokenizer("my_regex", tokenizer)

# ASCII/Arabic and non-ascii digits
text = "text2abc 3.14 tamil ௦௧௨௩௪ bengali ০১২৩৪ arabic01234"

show_tokens(text, "my_regex")
text=text2abc 3.14 tamil ௦௧௨௩௪ bengali ০১২৩৪ arabic01234
tokenizer_name='my_regex' tokenizer_args=None
[('2',), ('3',), ('14',), ('01234',)]
apsw.fts5.RegexPreTokenizer
Use regular expressions to extract tokens of interest such as identifiers, and then use a different tokenizer on the text between the regular expression matches. Contrast this with RegexTokenizer above, which ignores text not matching the pattern.
# For this example our identifiers are two digits slash two letters
text = "73/RS is larger than 65/ST"

# See what unicodewords does
show_tokens(text, "unicodewords")

# Setup RegexPreTokenizer
pattern = r"[0-9][0-9]/[A-Z][A-Z]"
tokenizer = functools.partial(
    apsw.fts5.RegexPreTokenizer, pattern=pattern
)
connection.register_fts5_tokenizer("myids", tokenizer)

# extract myids, leaving the other text to unicodewords
show_tokens(text, "myids", ["unicodewords"])
text=73/RS is larger than 65/ST
tokenizer_name='unicodewords' tokenizer_args=None
[('73',), ('RS',), ('is',), ('larger',), ('than',), ('65',), ('ST',)]


text=73/RS is larger than 65/ST
tokenizer_name='myids' tokenizer_args=['unicodewords']
[('73/RS',), ('is',), ('larger',), ('than',), ('65/ST',)]
HTML tokenizer
The HTMLTokenizer() will pass on the
extracted text to another tokenizer.
text = """<title>Big&Small</title><p>Hello <b>world</b>. BÀa"""

show_tokens(text, "html", ["unicodewords"])
text=<title>Big&Small</title><p>Hello <b>world</b>. BÀa
tokenizer_name='html' tokenizer_args=['unicodewords']
[('Big',), ('Small',), ('Hello',), ('world',), ('BÀa',)]
JSON tokenizer
JSONTokenizer() extracts strings from JSON for
processing by another tokenizer.
import json

data = {
    "name": "A car🚗",
    "items": ["one", 2, "three", {"four": "five"}],
}

text = json.dumps(data, indent=True)

# Keys can be extracted
show_tokens(text, "json", ["include_keys", "1", "unicodewords"])

# or ignored
show_tokens(text, "json", ["include_keys", "0", "unicodewords"])
text={
 "name": "A car\ud83d\ude97",
 "items": [
  "one",
  2,
  "three",
  {
   "four": "five"
  }
 ]
}
tokenizer_name='json' tokenizer_args=['include_keys', '1', 'unicodewords']
[('name',), ('A',), ('car',), ('🚗',), ('items',), ('one',), ('three',), ('four',), ('five',)]


text={
 "name": "A car\ud83d\ude97",
 "items": [
  "one",
  2,
  "three",
  {
   "four": "five"
  }
 ]
}
tokenizer_name='json' tokenizer_args=['include_keys', '0', 'unicodewords']
[('A',), ('car',), ('🚗',), ('one',), ('three',), ('five',)]
Synonym tokenizer
SynonymTokenizer() is useful to add colocated
tokens.
text = "one Colour first big dog"

# We use a dict
synonyms = {
    "colour": "color",
    "first": "1st",
    "dog": ["puppy", "canine", "k9"],
}

tokenizer = apsw.fts5.SynonymTokenizer(synonyms.get)

connection.register_fts5_tokenizer(
    "synonyms",
    tokenizer,
)

# It is to the left of simplify so we don't have to match all the
# different cases.  By default the synonyms tokenizer only applies when
# tokenizing queries.
show_tokens(
    text,
    "synonyms",
    ["simplify", "casefold", "1", "unicodewords"],
    reason=apsw.FTS5_TOKENIZE_QUERY,
)
text=one Colour first big dog
tokenizer_name='synonyms' tokenizer_args=['simplify', 'casefold', '1', 'unicodewords']
[('one',), ('colour', 'color'), ('first', '1st'), ('big',), ('dog', 'puppy', 'canine', 'k9')]
Stopwords tokenizer
StopWordsTokenizer() removes tokens from the
stream. Tokens that appear in almost every row aren’t useful for
finding good matches, and increase the size of the index.
@apsw.fts5.StopWordsTokenizer
def ignore(token: str) -> bool:
    # Return True if the token should be ignored.  These are common
    # English tokens that appear in almost all rows of English
    # content.
    return token in {
        "to",
        "the",
        "and",
        "for",
        "you",
        "of",
        "on",
        "is",
    }


text = "On the green hills and blue skies you see forever"

connection.register_fts5_tokenizer("ignore", ignore)

# It is to the left of simplify so we don't have to match all the
# different cases.
show_tokens(
    text, "ignore", ["simplify", "casefold", "1", "unicodewords"]
)
text=On the green hills and blue skies you see forever
tokenizer_name='ignore' tokenizer_args=['simplify', 'casefold', '1', 'unicodewords']
[('green',), ('hills',), ('blue',), ('skies',), ('see',), ('forever',)]
Transform tokenizer
TransformTokenizer() modifies tokens.
@apsw.fts5.TransformTokenizer
def transform(token: str) -> str:
    # we remove trailing 's' and 'ing'
    if token.endswith("s"):
        return token[:-1]
    if token.endswith("ing"):
        return token[:-3]
    return token


text = "Chickens playing towards talking plays talks"

connection.register_fts5_tokenizer("transform", transform)

# It is to the left of simplify so we don't have to match all the
# different cases.
show_tokens(
    text, "transform", ["simplify", "casefold", "1", "unicodewords"]
)
text=Chickens playing towards talking plays talks
tokenizer_name='transform' tokenizer_args=['simplify', 'casefold', '1', 'unicodewords']
[('chicken',), ('play',), ('toward',), ('talk',), ('play',), ('talk',)]
Cleanup
We can now close the connection, but it is optional.
connection.close()