Commit 504460e

authored

Zero index (#6)

* Beta release (0-index implementation) * Added basic summary description for dictdb collection

1 parent 6f71152 commit 504460e (Copy full SHA for 504460e)

File tree

12 files changed

+152

-59

lines changed

docs/src
- index.md
extras
- examples.jl
- py_benchmarks.py
src
test

12 files changed

+152

-59

lines changed

`‎docs/src/index.md‎`

Lines changed: 6 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -16,13 +16,16 @@ This package is particularly useful for natural language processing tasks whic`
`16`	`16`	`- [X] Support for unicodes`
`17`	`17`	`- [ ] Custom user defined feature generation methods`
`18`	`18`	`- [ ] Mecab-based tokenizer support`
	`19`	`+- [ ] Support for building databases directly from text files`
	`20`	`+- [ ] Support for persistent databases`
`19`	`21`
`20`	`22`	`## Supported String Similarity Measures`
`21`	`23`
`22`	`24`	`- [X] Dice coefficient`
`23`	`25`	`- [X] Jaccard coefficient`
`24`	`26`	`- [X] Cosine coefficient`
`25`	`27`	`- [X] Overlap coefficient`
	`28`	`+- [X] Exact match`
`26`	`29`
`27`	`30`	`## Installation`
`28`	`31`
`@@ -67,7 +70,9 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)`
`67`	`70`	`# ("foo", 1.0)`
`68`	`71`	`# ("fooo", 0.8888888888888888)`
`69`	`72`
`70`		`-`
	`73`	`+# Describe a working database collection`
	`74`	`+desc = describe_collection(db)`
	`75`	`+# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)`
`71`	`76`	```
`72`	`77`
`73`	`78`	`## TODO: Benchmarks`

`‎extras/examples.jl‎`

Lines changed: 5 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -21,14 +21,16 @@ push!(db, "foo");`
`21`	`21`	`push!(db, "bar");`
`22`	`22`	`push!(db, "fooo");`
`23`	`23`
`24`		`-f(x, c, s) = search(x, c, s)`
	`24`	`+f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)`
`25`	`25`	`test = "foo";`
`26`	`26`	`col = db;`
`27`	`27`	`sim = Cosine();`
	`28`	`+a = 0.8;`
	`29`	`+r = true;`
`28`	`30`
`29`		`-f(Cosine(), db, "foo")`
	`31`	`+f(Cosine(), db, "foo", 0.8, true)`
`30`	`32`
`31`		`-@btime f($sim, $col, $test)`
	`33`	`+@btime f($sim, $col, $test, $a, $r)`
`32`	`34`	`@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)`
`33`	`35`
`34`	`36`

`‎extras/py_benchmarks.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -13,4 +13,4 @@ def f(x):`
`13`	`13`	`for i in x:`
`14`	`14`	`db.add(i)`
`15`	`15`
`16`		`-# %time f(fake_names)`
	`16`	`+# %time f(fake_names)`

`‎src/SimString.jl‎`

Lines changed: 5 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -2,9 +2,9 @@ module SimString`
`2`	`2`
`3`	`3`	`import Base: push!, append!`
`4`	`4`	`using DataStructures: DefaultOrderedDict, DefaultDict`
`5`		`-#using ProgressMeter`
`6`		`-#using CircularArrays`
`7`		`-#using OffsetArrays`
	`5`	`+using ProgressMeter`
	`6`	`+using CircularArrays`
	`7`	`+using OffsetArrays`
`8`	`8`
`9`	`9`	`######### Import modules & utils ################`
`10`	`10`	`include("db_collection.jl")`
`@@ -16,8 +16,8 @@ include("search.jl")`
`16`	`16`
`17`	`17`
`18`	`18`	`####### Global export of user API #######`
`19`		`-export Dice, Jaccard, Cosine, Overlap,`
`20`		`- AbstractSimStringDB, DictDB,`
	`19`	`+export Dice, Jaccard, Cosine, Overlap, ExactMatch,`
	`20`	`+ AbstractSimStringDB, DictDB, describe_collection,`
`21`	`21`	`CharacterNGrams, WordNGrams,`
`22`	`22`	`search`
`23`	`23`

`‎src/dictdb.jl‎`

Lines changed: 32 additions & 11 deletions

Original file line number	Diff line number	Diff line change
`@@ -75,28 +75,49 @@ end`
`75`	`75`
`76`	`76`	`################################## DictDB UTIL Functions ############################`
`77`	`77`	`"""`
`78`		`-Internal function for retrieving existing features by size`
	`78`	`+ describe_collection(db::DictDB)`
	`79`	`+`
	`80`	`+Basic summary stats for the DB`
	`81`	`+`
	`82`	`+# Arguments`
	`83`	+* `db`: DictDB object
	`84`	`+`
	`85`	`+# Example`
	`86`	+```julia
	`87`	`+db = DictDB(CharacterNGrams(2, " "));`
	`88`	`+append!(db, ["foo", "bar", "fooo"]);`
	`89`	`+describe_collection(db)`
	`90`	`+`
	`91`	`+# Returns`
	`92`	`+* NamedTuple: Summary stats for the DB`
	`93`	+```
	`94`	`+`
`79`	`95`	`"""`
`80`		`-function retrieve_existing_feature_by_size(db::DictDB, size, feature)`
`81`		`- return db.string_feature_map[size][feature]`
`82`		`-end`
	`96`	`+function describe_collection(db::DictDB)`
`83`	`97`
	`98`	`+# Total number of strings in collection`
	`99`	`+∑ = length(db.string_collection)`
`84`	`100`
`85`		`-# """`
`86`		`-# Basic summary stats for the DB`
`87`		`-# """`
`88`		`-# function describe_db(db::DictDB)`
	`101`	`+# Average number of ngram features`
	`102`	`+n = [x for x in keys(db.string_size_map)]`
	`103`	`+μ = sum(n) / length(n)`
`89`	`104`
`90`		`-# end`
	`105`	`+# Total number of ngram features`
	`106`	`+total_ngrams = 0`
	`107`	`+for i in values(db.string_feature_map)`
	`108`	`+ total_ngrams += length(i)`
	`109`	`+end`
	`110`	`+`
	`111`	`+return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)`
	`112`	`+end`
`91`	`113`
`92`	`114`
`93`	`115`	`"""`
`94`	`116`	`Internal function to lookup feature sets by size and feature`
`95`	`117`	`"""`
`96`	`118`	`function lookup_feature_set_by_size_feature(db::DictDB, size, feature)`
`97`		`- # TODO: Clean this up and make it more efficient. Shouldn't updated db.string_feature_map`
`98`	`119`	`if feature ∉ keys(db.lookup_cache[size])`
`99`		`- db.lookup_cache[size][feature] = retrieve_existing_feature_by_size(db, size, feature)`
	`120`	`+ db.lookup_cache[size][feature] = get(db.string_feature_map[size], feature, Set{String}())`
`100`	`121`	`end`
`101`	`122`	`return db.lookup_cache[size][feature]`
`102`	`123`	`end`

`‎src/features.jl‎`

Lines changed: 11 additions & 6 deletions

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,6 @@ end`
`41`	`41`	`Internal function to create character-level ngrams features from an AbstractString`
`42`	`42`	`"""`
`43`	`43`	`function n_grams(extractor::CharacterNGrams, x, n)`
`44`		`- # Return counted n-grams (including duplicates)`
`45`	`44`	`return cummulative_ngram_count(init_ngrams(extractor, x, n))`
`46`	`45`	`end`
`47`	`46`
`@@ -54,13 +53,21 @@ function n_grams(extractor::WordNGrams, x, n)`
`54`	`53`	`end`
`55`	`54`
`56`	`55`
	`56`	`+"""`
	`57`	`+Internal function to make zero indexed circular arrays`
	`58`	`+"""`
	`59`	`+function make_zero_index_circular_array(x)`
	`60`	`+ return CircularArray(OffsetArray(x, 0:length(x)-1))`
	`61`	`+end`
	`62`	`+`
	`63`	`+`
`57`	`64`	`"""`
`58`	`65`	`Internal function to generate character-level ngrams features from an AbstractString`
`59`	`66`	`"""`
`60`	`67`	`function extract_features(extractor::CharacterNGrams, str)`
`61`	`68`	`n = extractor.n - 1 == 0 ? 1 : extractor.n - 1`
`62`	`69`	`str = pad_string(str, repeat(extractor.padder, n))`
`63`		`- return n_grams(extractor, str, extractor.n)`
	`70`	`+ return make_zero_index_circular_array(n_grams(extractor, str, extractor.n))`
`64`	`71`	`end`
`65`	`72`
`66`	`73`
`@@ -70,7 +77,7 @@ Internal function to generate word-level ngrams features from an AbstractString`
`70`	`77`	`function extract_features(extractor::WordNGrams, str)`
`71`	`78`	`words_split = split(str, extractor.splitter)`
`72`	`79`	`padded_words = pad_string(words_split, extractor.padder)`
`73`		`- return n_grams(extractor, padded_words, extractor.n)`
	`80`	`+ return make_zero_index_circular_array(n_grams(extractor, padded_words, extractor.n))`
`74`	`81`	`end`
`75`	`82`
`76`	`83`
`@@ -80,16 +87,14 @@ Internal function to count and pad generated character-level ngrams (including d`
`80`	`87`	`function cummulative_ngram_count(x)`
`81`	`88`	`counter = Dict{eltype(x), Int}()`
`82`	`89`
`83`		`- unique_list = map(x) do val`
	`90`	`+ return map(x) do val`
`84`	`91`	`if val in keys(counter)`
`85`	`92`	`counter[val] += 1`
`86`	`93`	`else`
`87`	`94`	`counter[val] = 1`
`88`	`95`	`end`
`89`	`96`	`(val, counter[val])`
`90`	`97`	`end`
`91`		`-`
`92`		`- return unique_list`
`93`	`98`	`end`
`94`	`99`
`95`	`100`

`‎src/measures.jl‎`

Lines changed: 36 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,11 @@ Overlap Similarity Measure.`
`30`	`30`	`struct Overlap <: AbstractSimilarityMeasure end`
`31`	`31`
`32`	`32`
	`33`	`+"""`
	`34`	`+Exact Match Similarity Measure.`
	`35`	`+"""`
	`36`	`+struct ExactMatch <: AbstractSimilarityMeasure end`
	`37`	`+`
`33`	`38`
`34`	`39`	`############## Minimum Feature Sizes Per Measure ##############`
`35`	`40`	`"""`
`@@ -64,8 +69,15 @@ function minimum_feature_size(measure::Overlap, query_size, α)`
`64`	`69`	`end`
`65`	`70`
`66`	`71`
	`72`	`+"""`
	`73`	`+Calculate minimum feature size for ExactMatch similarity measure.`
	`74`	`+"""`
	`75`	`+function minimum_feature_size(measure::ExactMatch, query_size, α)`
	`76`	`+ return query_size`
	`77`	`+end`
`67`	`78`	`############## Maximum Feature Size Per Measure ##############`
`68`	`79`
	`80`	`+`
`69`	`81`	`"""`
`70`	`82`	`Calculate maximum feature size for Dice similarity measure.`
`71`	`83`	`"""`
`@@ -98,6 +110,14 @@ function maximum_feature_size(measure::Overlap, db::AbstractSimStringDB, query_s`
`98`	`110`	`end`
`99`	`111`
`100`	`112`
	`113`	`+"""`
	`114`	`+Calculate maximum feature size for ExactMatch similarity measure.`
	`115`	`+"""`
	`116`	`+function maximum_feature_size(measure::ExactMatch, db::AbstractSimStringDB, query_size, α)`
	`117`	`+ return query_size`
	`118`	`+end`
	`119`	`+`
	`120`	`+`
`101`	`121`	`############## Similarity Score Per Measure ##############`
`102`	`122`	`"""`
`103`	`123`	`Calculate similarity score between X and Y using Dice similarity measure.`
`@@ -131,6 +151,13 @@ function similarity_score(measure::Overlap, X, Y)`
`131`	`151`	`end`
`132`	`152`
`133`	`153`
	`154`	`+"""`
	`155`	`+Calculate similarity score between X and Y using ExactMatch similarity measure.`
	`156`	`+"""`
	`157`	`+function similarity_score(measure::ExactMatch, X, Y)`
	`158`	`+ return Set(X) == Set(Y) ? 1.0 : 0.0`
	`159`	`+end`
	`160`	`+`
`134`	`161`
`135`	`162`	`############## Number of Minimum Overlaps Per Measure ##############`
`136`	`163`	`"""`
`@@ -166,4 +193,13 @@ using Overlap similarity measure.`
`166`	`193`	`"""`
`167`	`194`	`function minimum_overlap(measure::Overlap, query_size, candidate_size, α)`
`168`	`195`	`return ceil(Int, (α * min(query_size, candidate_size)) )`
	`196`	`+end`
	`197`	`+`
	`198`	`+`
	`199`	`+"""`
	`200`	`+Calculate the minimum overlap (τ) for a query size, candidate size, and α`
	`201`	`+using ExactMatch similarity measure.`
	`202`	`+"""`
	`203`	`+function minimum_overlap(measure::ExactMatch, query_size, candidate_size, α)`
	`204`	`+ return query_size`
`169`	`205`	`end`

`‎src/search.jl‎`

Lines changed: 7 additions & 30 deletions

Original file line number	Diff line number	Diff line change
`@@ -60,43 +60,22 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat`
`60`	`60`	`features = sort(features, by = i -> length(lookup_feature_set_by_size_feature(db_collection, candidate_size, i) ) )`
`61`	`61`
`62`	`62`	`# Count the occurrences of each feature`
`63`		`- candidate_match_counts = DefaultDict(0)`
`64`		`-`
	`63`	`+ candidate_match_counts = DefaultDict{String, Int}(0)`
`65`	`64`	`feature_slice_index = query_feature_length - τ + 1`
	`65`	`+ idx = query_feature_length - τ`
	`66`	`+ focus_features = feature_slice_index < 0 ? (@view features[0:end + feature_slice_index]) : (@view features[0:idx])`
`66`	`67`
`67`		`- if feature_slice_index < 0`
`68`		`- focus_features = features[1:end + feature_slice_index]`
`69`		`- else`
`70`		`- focus_features = features[1:feature_slice_index]`
`71`		`- end`
`72`		`-`
`73`		`- for i in focus_features`
	`68`	`+ @inbounds @views for i in focus_features`
`74`	`69`	`for s in lookup_feature_set_by_size_feature(db_collection, candidate_size, i)`
`75`	`70`	`candidate_match_counts[s] += 1`
`76`	`71`	`end`
`77`	`72`	`end`
`78`	`73`
`79`	`74`	`results = String[]`
`80`	`75`
`81`		`- # TODO: Return results in case of a perfect match??`
`82`		`- # if τ == 1`
`83`		`- # results = collect(keys(candidate_match_counts))`
`84`		`- # end`
`85`		`-`
`86`	`76`	`for (candidate, match_count) in candidate_match_counts`
`87`		`-`
`88`		`- for i in (query_feature_length - τ + 1) : query_feature_length - 1 # TODO: Verify`
`89`		`-`
`90`		`- if i < 0`
`91`		`- feature = features[end + i]`
`92`		`- elseif i == 0`
`93`		`- feature = features[i+1]`
`94`		`- else`
`95`		`- feature = features[i]`
`96`		`-`
`97`		`- end`
`98`		`-`
`99`		`- if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, feature)`
	`77`	`+ for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify`
	`78`	`+ if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])`
`100`	`79`	`match_count += 1`
`101`	`80`	`end`
`102`	`81`
`@@ -106,11 +85,9 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat`
`106`	`85`	`end`
`107`	`86`
`108`	`87`	`remaining_count = query_feature_length - i - 1`
`109`		`-`
`110`	`88`	`if (match_count + remaining_count) < τ`
`111`	`89`	`break`
`112`	`90`	`end`
`113`		`-`
`114`	`91`	`end`
`115`	`92`	`end`
`116`	`93`	`return results`
`@@ -133,7 +110,7 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer`
`133`	`110`	`results = String[]`
`134`	`111`
`135`	`112`	`# Generate and return results from the potential candidate size pool`
`136`		`- for candidate_size in min_feature_size:max_feature_size`
	`113`	`+ @inbounds for candidate_size in min_feature_size:max_feature_size`
`137`	`114`	`# Minimum overlap`
`138`	`115`	`τ = minimum_overlap(measure, length_of_features, candidate_size, α)`
`139`	`116`

`‎test/test01_dictdb.jl‎`

Lines changed: 9 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,15 @@ end`
`64`	`64`
`65`	`65`
`66`	`66`
	`67`	`+@testset "Test describe functionality" begin`
	`68`	`+ db = DictDB(CharacterNGrams(2, " "));`
	`69`	`+ append!(db, ["foo", "bar", "fooo"]);`
	`70`	`+`
	`71`	`+ # Interact with db`
	`72`	`+ search(Dice(), db, "zep"; α=0.8, ranked=true)`
	`73`	`+`
	`74`	`+ @test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)`
	`75`	`+end`
`67`	`76`
`68`	`77`
`69`	`78`

`‎test/test02_features.jl‎`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -5,10 +5,10 @@ using Test`
`5`	`5`
`6`	`6`	`@testset "Test feature extraction" begin`
`7`	`7`	`char_ngram_res = SimString.extract_features(CharacterNGrams(3, " "), "prepress")`
`8`		`- @test char_ngram_res[6] == ("pre", 2)`
	`8`	`+ @test char_ngram_res[5] == ("pre", 2)`
`9`	`9`
`10`	`10`	`word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")`
`11`		`- @test word_ngram_res[6] == (("really", "really"), 2)`
	`11`	`+ @test word_ngram_res[5] == (("really", "really"), 2)`
`12`	`12`	`end`
`13`	`13`
`14`	`14`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 504460e

File tree

12 files changed

12 files changed

`‎docs/src/index.md‎`

`‎extras/examples.jl‎`

`‎extras/py_benchmarks.py‎`

`‎src/SimString.jl‎`

`‎src/dictdb.jl‎`

`‎src/features.jl‎`

`‎src/measures.jl‎`

`‎src/search.jl‎`

`‎test/test01_dictdb.jl‎`

`‎test/test02_features.jl‎`

0 commit comments