Mecab support #24


Merged
PyDataBlog merged 5 commits into main from mecab
Aug 9, 2023
7 changes: 6 additions & 1 deletion .github/workflows/CI.yml
@@ -16,7 +16,7 @@ jobs:
os:
- ubuntu-latest
- macOS-latest
- windows-latest
# - windows-latest
arch:
- x64
steps:
@@ -35,6 +35,11 @@ jobs:
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- name: Install Mecab (MacOS)
if: runner.os == 'macOS'
run: |
brew install mecab
brew install mecab-ipadic
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
6 changes: 4 additions & 2 deletions Project.toml
@@ -1,23 +1,25 @@
name = "SimString"
uuid = "2e3c4037-312d-4650-b9c0-fcd0fc09aae4"
authors = ["Bernard Brenyah"]
version = "0.2.0"
version = "0.3.0"

[deps]
CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Wakame = "4447db07-3941-47e2-90a2-965b7cb1b6ce"

[compat]
CircularArrays = "1"
DataStructures = "0.18"
OffsetArrays = "1"
julia = "1"
Wakame = "0.1"

[extras]
Faker = "0efc519c-db33-5916-ab87-703215c3906f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "Faker", "Suppressor"]
5 changes: 2 additions & 3 deletions README.md
@@ -16,9 +16,8 @@ This package is particularly useful for natural language processing tasks which
- [X] 100% exact retrieval
- [X] Support for unicodes
- [X] Support for building databases directly from text files
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support
- [ ] Support for persistent databases
- [X] Mecab-based tokenizer support
- [ ] Support for persistent databases like MongoDB

## Supported String Similarity Measures

10 changes: 6 additions & 4 deletions docs/src/index.md
@@ -17,9 +17,8 @@ CPMerge Paper: [https://aclanthology.org/C10-1096/](https://aclanthology.org/C10
- [X] 100% exact retrieval
- [X] Support for unicodes
- [X] Support for building databases directly from text files
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support
- [ ] Support for persistent databases
- [X] Mecab-based tokenizer support for Japanese
- [ ] Support for persistent databases like MongoDB

## Supported String Similarity Measures

@@ -59,7 +58,9 @@ pkg> free SimString
using SimString

# Initialize the database with some strings
db = DictDB(CharacterNGrams(2, " "));
db = DictDB(CharacterNGrams(2, " "));
# OR: db = DictDB(WordNGrams(2, " ", " ")); for word-based ngrams
# OR: db = DictDB(MecabNGrams(2, " ", Mecab())); for Japanese ngrams. Requires a local installation of Mecab
push!(db, "foo");
push!(db, "bar");
push!(db, "fooo");
@@ -85,6 +86,7 @@ desc = describe_collection(db)

- 0.1.0 Initial release.
- 0.2.0 Added support for unicodes
- 0.3.0 Added Japanese support via Mecab

```@index
```
3 changes: 2 additions & 1 deletion src/SimString.jl
@@ -4,6 +4,7 @@ import Base: push!, append!
using DataStructures: DefaultOrderedDict, DefaultDict
using CircularArrays
using OffsetArrays
using Wakame

######### Import modules & utils ################
include("db_collection.jl")
@@ -17,7 +18,7 @@ include("search.jl")
####### Global export of user API #######
export Dice, Jaccard, Cosine, Overlap, ExactMatch,
AbstractSimStringDB, DictDB, describe_collection,
CharacterNGrams, WordNGrams,
CharacterNGrams, WordNGrams, MecabNGrams,
search


11 changes: 9 additions & 2 deletions src/db_collection.jl
@@ -12,8 +12,6 @@ Abstract type for feature extraction structs
abstract type FeatureExtractor end


# Feature Extraction Definitions

"""
Feature extraction on character-level ngrams
"""
@@ -33,3 +31,12 @@ struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
end



"""
Feature extraction based on Mecab word-level ngrams
"""
struct MecabNGrams{T1<:Int, T2<:AbstractString, T3<:Mecab} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
tokenizer::T3 # Mecab tokenizer to use
end
66 changes: 46 additions & 20 deletions src/dictdb.jl
@@ -44,6 +44,19 @@ function DictDB(x::CharacterNGrams)
end


"""
Internal function for generating a base DictDB object for WordNGrams and MecabNGrams
"""
function generate_base_dict_db(x)
DictDB(
x,
String[],
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
DefaultDict{ Int, DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String} }(Set{String})),
DefaultDict{ Int, DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}}(Set{String}))
)
end

"""
DictDB(x::WordNGrams)

@@ -60,15 +73,28 @@ db = DictDB(WordNGrams(2, " ", " "))
# Returns
* `DictDB`: A DictDB object with additional containers and Metadata for WordNGrams
"""
function DictDB(x::WordNGrams)
DictDB(
x,
String[],
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
DefaultDict{ Int, DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String} }(Set{String})),
DefaultDict{ Int, DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}}(Set{String}))
)
end
DictDB(x::WordNGrams) = generate_base_dict_db(x)



"""
DictDB(x::MecabNGrams)

Initialize a dict DB with additional containers and Metadata for MecabNGrams

# Arguments
* `x`: MecabNGrams object

# Example
```julia
db = DictDB(MecabNGrams(2, " ", Mecab()))
```

# Returns
* `DictDB`: A DictDB object with additional containers and Metadata for MecabNGrams
"""
DictDB(x::MecabNGrams) = generate_base_dict_db(x)




@@ -96,20 +122,20 @@ describe_collection(db)
"""
function describe_collection(db::DictDB)

# Total number of strings in collection
∑ = length(db.string_collection)
# Total number of strings in collection
∑ = length(db.string_collection)

# Average size of ngram features
n = [x for x in keys(db.string_size_map)]
μ = sum(n) / length(n)
# Average size of ngram features
n = [x for x in keys(db.string_size_map)]
μ = sum(n) / length(n)

# Total number of ngram features
total_ngrams = 0
for i in values(db.string_feature_map)
total_ngrams += length(i)
end
# Total number of ngram features
total_ngrams = 0
for i in values(db.string_feature_map)
total_ngrams += length(i)
end

return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
end
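The bookkeeping in `describe_collection` can be tried standalone. The sketch below computes the same summary numbers over hypothetical stand-ins for the DictDB fields; the names `strings` and `size_map` are illustrative only and not part of the SimString API.

```julia
# Plain-Julia sketch of the summary statistics describe_collection reports.
# `strings` and `size_map` stand in for db.string_collection and
# db.string_size_map; both names are hypothetical.
strings  = ["foo", "bar", "fooo"]
size_map = Dict(4 => Set(["foo", "bar"]), 5 => Set(["fooo"]))  # feature size => strings

total    = length(strings)              # total strings in the collection
sizes    = collect(keys(size_map))
avg_size = sum(sizes) / length(sizes)   # mean n-gram feature size

stats = (total_collection = total, avg_size_ngrams = avg_size)
# stats == (total_collection = 3, avg_size_ngrams = 4.5)
```

The real function additionally sums the lengths of `db.string_feature_map` values to report `total_ngrams`.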


35 changes: 22 additions & 13 deletions src/features.jl
@@ -36,29 +36,20 @@ function init_ngrams(extractor::CharacterNGrams, x, n)
end



"""
Internal function to generate intial uncounted ngrams on a word level
Internal function to generate initial uncounted word-level ngrams
"""
function init_ngrams(extractor::WordNGrams, x, n)
function init_ngrams(extractor, x, n)
map(0:length(x)-n) do i
@view x[i+1: i+n]
end
end
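The sliding-window comprehension above can be exercised on its own; this stdlib-only sketch inlines the same indexing on a plain `Vector{String}` (the helper name `word_ngrams_sketch` is hypothetical, not part of SimString):

```julia
# Standalone sketch of the sliding-window n-gram extraction used by init_ngrams:
# take every length-n window of the padded token vector.
# word_ngrams_sketch is a hypothetical helper, not part of SimString.
function word_ngrams_sketch(words::Vector{String}, n::Int)
    [words[i+1:i+n] for i in 0:length(words)-n]
end

word_ngrams_sketch(["I", "am", "a", "cat"], 2)
# => [["I", "am"], ["am", "a"], ["a", "cat"]]
```

The package version returns `@view`s instead of copies to avoid allocating a new vector per window.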


"""
Internal function to create character-level ngrams features from an AbstractString
"""
function n_grams(extractor::CharacterNGrams, x, n)
return cummulative_ngram_count(init_ngrams(extractor, x, n))
end


"""
Internal function to create word-level ngrams from an AbstractVector
Internal function to create counted ngrams
"""
function n_grams(extractor::WordNGrams, x, n)
function n_grams(extractor, x, n)
return cummulative_ngram_count(init_ngrams(extractor, x, n))
end

@@ -91,6 +82,24 @@ function extract_features(extractor::WordNGrams, str)
end


"""
Internal function to generate Mecab word-level ngram features from an AbstractString
"""
function extract_features(extractor::MecabNGrams, str)
words_split = tokenize(extractor.tokenizer, str)
padded_words = pad_string(words_split, extractor.padder)
return make_zero_index_circular_array(n_grams(extractor, padded_words, extractor.n))
end


"""
Internal function to tokenize a string using Mecab
"""
function tokenize(tokenizer::Mecab, str::AbstractString)
return parse_surface(tokenizer, str)
end


"""
Internal function to count and pad generated character-level ngrams (including duplicates)
"""
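The duplicate handling that `cummulative_ngram_count` performs can be sketched in plain Julia: each n-gram is paired with a running occurrence index, which is how test expectations such as `(["really", "really"], 2)` arise. The helper name below is hypothetical and is not SimString's implementation.

```julia
# Hedged sketch of cumulative duplicate counting: every n-gram is paired with
# its running occurrence index, so repeated n-grams remain distinguishable.
# cumulative_count_sketch is a hypothetical helper, not part of SimString.
function cumulative_count_sketch(ngrams)
    seen = Dict{eltype(ngrams), Int}()
    map(ngrams) do g
        seen[g] = get(seen, g, 0) + 1
        (g, seen[g])
    end
end

cumulative_count_sketch(["ab", "bc", "ab"])
# => [("ab", 1), ("bc", 1), ("ab", 2)]
```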
10 changes: 10 additions & 0 deletions test/test01_dictdb.jl
@@ -1,5 +1,6 @@
module TestDBCollection
using SimString
using Wakame: Mecab
using Test


@@ -110,5 +111,14 @@ end



@testset "Test mecab insert" begin
db = DictDB(MecabNGrams(2, " ", Mecab()))
append!(db, ["pythonが大好きです", "I am a cat."])

@test db.string_collection == ["pythonが大好きです", "I am a cat."]
@test db.string_size_map[5] == Set(["pythonが大好きです"])
@test db.string_size_map[6] == Set(["I am a cat."])
end


end # module
5 changes: 5 additions & 0 deletions test/test02_features.jl
@@ -1,5 +1,6 @@
module TestFeatures
using SimString
using Wakame: Mecab
using Test


@@ -13,6 +14,10 @@ using Test
word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude 😄🍕")
@test word_ngram_res[5] == (["really", "really"], 2)
@test word_ngram_res[8] == (["dude", "😄🍕"], 1)

mecab_ngram_res = SimString.extract_features(MecabNGrams(2, " ", Mecab()), "pythonが大好きです")
@test mecab_ngram_res[1] == (["python", "が"], 1)
@test mecab_ngram_res[2] == (["が", "大好き"], 1)
end


