Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 504460e

Browse files
Zero index (#6)
* Beta release (0-index implementation) * Added basic summary description for dictdb collection
1 parent 6f71152 commit 504460e

File tree

12 files changed

+152
-59
lines changed

12 files changed

+152
-59
lines changed

‎docs/src/index.md‎

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,16 @@ This package is be particulary useful for natural language processing tasks whic
1616
- [X] Support for unicodes
1717
- [ ] Custom user defined feature generation methods
1818
- [ ] Mecab-based tokenizer support
19+
- [ ] Support for building databases directly from text files
20+
- [ ] Support for persistent databases
1921

2022
## Supported String Similarity Measures
2123

2224
- [X] Dice coefficient
2325
- [X] Jaccard coefficient
2426
- [X] Cosine coefficient
2527
- [X] Overlap coefficient
28+
- [X] Exact match
2629

2730
## Installation
2831

@@ -67,7 +70,9 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)
6770
# ("foo", 1.0)
6871
# ("fooo", 0.8888888888888888)
6972

70-
73+
# Describe a working database collection
74+
desc = describe_collection(db)
75+
# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
7176
```
7277

7378
## TODO: Benchmarks

‎extras/examples.jl‎

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,16 @@ push!(db, "foo");
2121
push!(db, "bar");
2222
push!(db, "fooo");
2323

24-
f(x, c, s) = search(x, c, s)
24+
f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
2525
test = "foo";
2626
col = db;
2727
sim = Cosine();
28+
a = 0.8;
29+
r = true;
2830

29-
f(Cosine(), db, "foo")
31+
f(Cosine(), db, "foo", 0.8, true)
3032

31-
@btime f($sim, $col, $test)
33+
@btime f($sim, $col, $test, $a, $r)
3234
@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)
3335

3436

‎extras/py_benchmarks.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@ def f(x):
1313
for i in x:
1414
db.add(i)
1515

16-
# %time f(fake_names)
16+
# %time f(fake_names)

‎src/SimString.jl‎

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ module SimString
22

33
import Base: push!, append!
44
using DataStructures: DefaultOrderedDict, DefaultDict
5-
#using ProgressMeter
6-
#using CircularArrays
7-
#using OffsetArrays
5+
using ProgressMeter
6+
using CircularArrays
7+
using OffsetArrays
88

99
######### Import modules & utils ################
1010
include("db_collection.jl")
@@ -16,8 +16,8 @@ include("search.jl")
1616

1717

1818
####### Global export of user API #######
19-
export Dice, Jaccard, Cosine, Overlap,
20-
AbstractSimStringDB, DictDB,
19+
export Dice, Jaccard, Cosine, Overlap, ExactMatch,
20+
AbstractSimStringDB, DictDB, describe_collection,
2121
CharacterNGrams, WordNGrams,
2222
search
2323

‎src/dictdb.jl‎

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -75,28 +75,49 @@ end
7575

7676
################################## DictDB UTIL Functions ############################
7777
"""
78-
Internal function for retrieving existing features by size
78+
describe_collection(db::DictDB)
79+
80+
Basic summary stats for the DB
81+
82+
# Arguments
83+
* `db`: DictDB object
84+
85+
# Example
86+
```julia
87+
db = DictDB(CharacterNGrams(2, " "));
88+
append!(db, ["foo", "bar", "fooo"]);
89+
describe_collection(db)
90+
91+
# Returns
92+
* NamedTuples: Summary stats for the DB
93+
```
94+
7995
"""
80-
function retrieve_existing_feature_by_size(db::DictDB, size, feature)
81-
return db.string_feature_map[size][feature]
82-
end
96+
function describe_collection(db::DictDB)
8397

98+
# Total number of strings in collection
99+
∑ = length(db.string_collection)
84100

85-
# """
86-
# Basic summary stats for the DB
87-
# """
88-
# function describe_db(db::DictDB)
101+
# Average number of ngram features
102+
n = [x for x in keys(db.string_size_map)]
103+
μ = sum(n) / length(n)
89104

90-
# end
105+
# Total number of ngram features
106+
total_ngrams = 0
107+
for i in values(db.string_feature_map)
108+
total_ngrams += length(i)
109+
end
110+
111+
return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
112+
end
91113

92114

93115
"""
94116
Internal function to lookup feature sets by size and feature
95117
"""
96118
function lookup_feature_set_by_size_feature(db::DictDB, size, feature)
97-
# TODO: Clean this up and make it more efficient. Shouldn't updated db.string_feature_map
98119
if feature ∉ keys(db.lookup_cache[size])
99-
db.lookup_cache[size][feature] = retrieve_existing_feature_by_size(db, size, feature)
120+
db.lookup_cache[size][feature] = get(db.string_feature_map[size], feature, Set{String}())
100121
end
101122
return db.lookup_cache[size][feature]
102123
end

‎src/features.jl‎

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ end
4141
Internal function to create character-level ngrams features from an AbstractString
4242
"""
4343
function n_grams(extractor::CharacterNGrams, x, n)
44-
# Return counted n-grams (including duplicates)
4544
return cummulative_ngram_count(init_ngrams(extractor, x, n))
4645
end
4746

@@ -54,13 +53,21 @@ function n_grams(extractor::WordNGrams, x, n)
5453
end
5554

5655

56+
"""
57+
Internal function to make zero indexed circular arrays
58+
"""
59+
function make_zero_index_circular_array(x)
60+
return CircularArray(OffsetArray(x, 0:length(x)-1))
61+
end
62+
63+
5764
"""
5865
Internal function to generate character-level ngrams features from an AbstractString
5966
"""
6067
function extract_features(extractor::CharacterNGrams, str)
6168
n = extractor.n - 1 == 0 ? 1 : extractor.n - 1
6269
str = pad_string(str, repeat(extractor.padder, n))
63-
return n_grams(extractor, str, extractor.n)
70+
return make_zero_index_circular_array(n_grams(extractor, str, extractor.n))
6471
end
6572

6673

@@ -70,7 +77,7 @@ Internal function to generate word-level ngrams features from an AbstractString
7077
function extract_features(extractor::WordNGrams, str)
7178
words_split = split(str, extractor.splitter)
7279
padded_words = pad_string(words_split, extractor.padder)
73-
return n_grams(extractor, padded_words, extractor.n)
80+
return make_zero_index_circular_array(n_grams(extractor, padded_words, extractor.n))
7481
end
7582

7683

@@ -80,16 +87,14 @@ Internal function to count and pad generated character-level ngrams (including d
8087
function cummulative_ngram_count(x)
8188
counter = Dict{eltype(x), Int}()
8289

83-
unique_list = map(x) do val
90+
return map(x) do val
8491
if val in keys(counter)
8592
counter[val] += 1
8693
else
8794
counter[val] = 1
8895
end
8996
(val, counter[val])
9097
end
91-
92-
return unique_list
9398
end
9499

95100

‎src/measures.jl‎

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ Overlap Similarity Measure.
3030
struct Overlap <: AbstractSimilarityMeasure end
3131

3232

33+
"""
34+
Exact Match Similarity Measure.
35+
"""
36+
struct ExactMatch <: AbstractSimilarityMeasure end
37+
3338

3439
############## Minimum Feature Sizes Per Measure ##############
3540
"""
@@ -64,8 +69,15 @@ function minimum_feature_size(measure::Overlap, query_size, α)
6469
end
6570

6671

72+
"""
73+
Calculate minimum feature size for ExactMatch similarity measure.
74+
"""
75+
function minimum_feature_size(measure::ExactMatch, query_size, α)
76+
return query_size
77+
end
6778
############## Maximum Feature Size Per Measure ##############
6879

80+
6981
"""
7082
Calculate maximum feature size for Dice similarity measure.
7183
"""
@@ -98,6 +110,14 @@ function maximum_feature_size(measure::Overlap, db::AbstractSimStringDB, query_s
98110
end
99111

100112

113+
"""
114+
Calculate maximum feature size for ExactMatch similarity measure.
115+
"""
116+
function maximum_feature_size(measure::ExactMatch, db::AbstractSimStringDB, query_size, α)
117+
return query_size
118+
end
119+
120+
101121
############## Similarity Score Per Measure ##############
102122
"""
103123
Calculate similarity score between X and Y using Dice similarity measure.
@@ -131,6 +151,13 @@ function similarity_score(measure::Overlap, X, Y)
131151
end
132152

133153

154+
"""
155+
Calculate similarity score between X and Y using ExactMatch similarity measure.
156+
"""
157+
function similarity_score(measure::ExactMatch, X, Y)
158+
return Set(X) == Set(Y) ? 1.0 : 0.0
159+
end
160+
134161

135162
############## Number of Minimum Overlaps Per Measure ##############
136163
"""
@@ -166,4 +193,13 @@ using Overlap similarity measure.
166193
"""
167194
function minimum_overlap(measure::Overlap, query_size, candidate_size, α)
168195
return ceil(Int, (α * min(query_size, candidate_size)) )
196+
end
197+
198+
199+
"""
200+
Calculate the minimum overlap (τ) for a query size, candidate size, and α
201+
using ExactMatch similarity measure.
202+
"""
203+
function minimum_overlap(measure::ExactMatch, query_size, candidate_size, α)
204+
return query_size
169205
end

‎src/search.jl‎

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -60,43 +60,22 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
6060
features = sort(features, by = i -> length(lookup_feature_set_by_size_feature(db_collection, candidate_size, i) ) )
6161

6262
# Count the occurrences of each feature
63-
candidate_match_counts = DefaultDict(0)
64-
63+
candidate_match_counts = DefaultDict{String, Int}(0)
6564
feature_slice_index = query_feature_length - τ + 1
65+
idx = query_feature_length - τ
66+
focus_features = feature_slice_index < 0 ? (@view features[0:end + feature_slice_index]) : (@view features[0:idx])
6667

67-
if feature_slice_index < 0
68-
focus_features = features[1:end + feature_slice_index]
69-
else
70-
focus_features = features[1:feature_slice_index]
71-
end
72-
73-
for i in focus_features
68+
@inbounds @views for i in focus_features
7469
for s in lookup_feature_set_by_size_feature(db_collection, candidate_size, i)
7570
candidate_match_counts[s] += 1
7671
end
7772
end
7873

7974
results = String[]
8075

81-
# TODO: Return results in case of a perfect match??
82-
# if τ == 1
83-
# results = collect(keys(candidate_match_counts))
84-
# end
85-
8676
for (candidate, match_count) in candidate_match_counts
87-
88-
for i in (query_feature_length - τ + 1) : query_feature_length - 1 # TODO: Verify
89-
90-
if i < 0
91-
feature = features[end + i]
92-
elseif i == 0
93-
feature = features[i+1]
94-
else
95-
feature = features[i]
96-
97-
end
98-
99-
if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, feature)
77+
for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
78+
if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
10079
match_count += 1
10180
end
10281

@@ -106,11 +85,9 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
10685
end
10786

10887
remaining_count = query_feature_length - i - 1
109-
11088
if (match_count + remaining_count) < τ
11189
break
11290
end
113-
11491
end
11592
end
11693
return results
@@ -133,7 +110,7 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
133110
results = String[]
134111

135112
# Generate and return results from the potential candidate size pool
136-
for candidate_size in min_feature_size:max_feature_size
113+
@inbounds for candidate_size in min_feature_size:max_feature_size
137114
# Minimum overlap
138115
τ = minimum_overlap(measure, length_of_features, candidate_size, α)
139116

‎test/test01_dictdb.jl‎

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,15 @@ end
6464

6565

6666

67+
@testset "Test describe functionality" begin
68+
db = DictDB(CharacterNGrams(2, " "));
69+
append!(db, ["foo", "bar", "fooo"]);
70+
71+
# Interact with db
72+
search(Dice(), db, "zep"; α=0.8, ranked=true)
73+
74+
@test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
75+
end
6776

6877

6978

‎test/test02_features.jl‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ using Test
55

66
@testset "Test feature extraction" begin
77
char_ngram_res = SimString.extract_features(CharacterNGrams(3, " "), "prepress")
8-
@test char_ngram_res[6] == ("pre", 2)
8+
@test char_ngram_res[5] == ("pre", 2)
99

1010
word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
11-
@test word_ngram_res[6] == (("really", "really"), 2)
11+
@test word_ngram_res[5] == (("really", "really"), 2)
1212
end
1313

1414

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /