Name	Name	Last commit message	Last commit date
Latest commit History 663 Commits
.github	.github
crf	crf
data	data
examples	examples
gonn	gonn
hmm	hmm
testdata	testdata
tf	tf
tools	tools
.gitignore	.gitignore
.travis.yml	.travis.yml
CONTRIBUTING.md	CONTRIBUTING.md
LICENSE	LICENSE
README.md	README.md
README_zh.md	README_zh.md
circle.yml	circle.yml
dag.go	dag.go
dict_1.16.go	dict_1.16.go
dict_1.16_test.go	dict_1.16_test.go
dict_util.go	dict_util.go
dictionary.go	dictionary.go
go.mod	go.mod
go.sum	go.sum
gse.go	gse.go
gse_bm_test.go	gse_bm_test.go
gse_test.go	gse_test.go
seg_utils.go	seg_utils.go
segmenter.go	segmenter.go
segmenter_test.go	segmenter_test.go
stop.go	stop.go
token.go	token.go
token_test.go	token_test.go
trim.go	trim.go

gse

An efficient multilingual NLP and text segmentation library for Go; it supports English, Chinese, Japanese, and other languages, and works with Elasticsearch and Bleve.

Build Status CircleCI Status codecov Build Status Go Report Card GoDoc GitHub release Join the chat at https://gitter.im/go-ego/ego

简体中文

Gse is an implementation of jieba in Go, and aims to add NLP support and more features.

Feature:

Supports multiple word segmentation modes: common, search engine, full mode, precise mode, and HMM mode;
Supports user and embedded dictionaries, part-of-speech (POS) tagging, segment info analysis, and stop and trim words
Supports multiple languages: English, Chinese, Japanese, and others
Supports Traditional Chinese
Supports HMM text cutting using the Viterbi algorithm
Supports NLP with TensorFlow (in progress)
Named Entity Recognition (in progress)
Works with Elasticsearch and Bleve
Can run as a JSON-RPC service.

Algorithm:

The dictionary is implemented with a double-array trie.
The segmenter uses a shortest-path algorithm (based on word frequency and dynamic programming), together with DAG and HMM word segmentation algorithms.

Text Segmentation speed:

Single thread: 9.2 MB/s
Concurrent goroutines: 26.8 MB/s
HMM text segmentation, single thread: 3.2 MB/s (2 cores, 4 threads, MacBook Pro).

Binding:

gse-bind: bindings for JavaScript and other languages, so that gse can be used from more languages.

Install / update

go get -u github.com/go-ego/gse

Use

package main
import (
	"fmt"
	"regexp"
	"github.com/go-ego/gse"
	"github.com/go-ego/gse/hmm/pos"
)
var (
	// text is the sample input shared by the example functions below.
	text = "Hello world, Helloworld. Winter is coming! 你好世界."
	// NOTE(review): the name `new` shadows Go's builtin new() throughout this
	// package, and the second value returned by gse.New (presumably an error)
	// is discarded with `_`.
	new, _ = gse.New("zh,testdata/test_dict3.txt", "alpha")
	// seg is declared without an initializer; main calls seg.LoadDict() on it
	// before the examples use it.
	seg gse.Segmenter
	// posSeg is used by cutPos after it calls pos.WithGse(seg).
	posSeg pos.Segmenter
)
// main loads the default dictionary into seg and then runs the cut and
// segCut examples.
func main() {
	// Load the default dictionary. Stop early when loading fails instead of
	// silently dropping the error: the examples below depend on seg.
	if err := seg.LoadDict(); err != nil {
		fmt.Println("load dictionary failed: ", err)
		return
	}
	// Alternative ways to load a dictionary:
	//
	// Load the default dictionary from the embedded data
	// seg.LoadDictEmbed()
	//
	// Load the simplified Chinese dictionary
	// seg.LoadDict("zh_s")
	// seg.LoadDictEmbed("zh_s")
	//
	// Load the traditional Chinese dictionary
	// seg.LoadDict("zh_t")
	//
	// Load the Japanese dictionary
	// seg.LoadDict("jp")
	//
	// Load a dictionary from an explicit file path
	// seg.LoadDict("your gopath"+"/src/github.com/go-ego/gse/data/dict/dictionary.txt")
	cut()
	segCut()
}
func cut() {
	hmm := new.Cut(text, true)
	fmt.Println("cut use hmm: ", hmm)
	hmm = new.CutSearch(text, true)
	fmt.Println("cut search use hmm: ", hmm)
	fmt.Println("analyze: ", new.Analyze(hmm, text))
	hmm = new.CutAll(text)
	fmt.Println("cut all: ", hmm)
	reg := regexp.MustCompile(`(\d+年|\d+月|\d+日|[\p{Latin}]+|[\p{Hangul}]+|\d+\.\d+|[a-zA-Z0-9]+)`)
	text1 := `헬로월드 헬로 서울, 2021年09月10日, 3.14`
	hmm = seg.CutDAG(text1, reg)
	fmt.Println("Cut with hmm and regexp: ", hmm, hmm[0], hmm[6])
}
// analyzeAndTrim prints the result of seg.Analyze for the given words, then
// the result of seg.Trim on them, and finally the seg.String and seg.Slice
// results for the shared sample text.
func analyzeAndTrim(cut []string) {
	fmt.Println("analyze the segment: ", seg.Analyze(cut, ""))

	trimmed := seg.Trim(cut)
	fmt.Println("cut all: ", trimmed)

	joined := seg.String(text, true)
	fmt.Println(joined)

	pieces := seg.Slice(text, true)
	fmt.Println(pieces)
}
// cutPos prints the seg.Pos result for the shared sample text and its
// seg.TrimPos form, then calls pos.WithGse(seg) and prints the posSeg.Cut
// result and its posSeg.TrimWithPos(..., "zg") form.
func cutPos() {
	tagged := seg.Pos(text, true)
	fmt.Println("pos: ", tagged)
	fmt.Println("trim pos: ", seg.TrimPos(tagged))

	// Hand seg to the pos package before using posSeg.
	pos.WithGse(seg)

	posTagged := posSeg.Cut(text, true)
	fmt.Println("pos: ", posTagged)
	fmt.Println("trim pos: ", posSeg.TrimWithPos(posTagged, "zg"))
}
func segCut() {
	// Text Segmentation
	tb := []byte(text)
	fmt.Println(seg.String(text, true))
	segments := seg.Segment(tb)
	// Handle word segmentation results, search mode
	fmt.Println(gse.ToString(segments, true))
}

Look at a custom dictionary example

package main
import (
	"fmt"
	_ "embed"
	"github.com/go-ego/gse"
)
// testDict receives the contents of test_dict3.txt through the go:embed
// directive below; main appends it to the dictionary string passed to
// gse.NewEmbed.
//go:embed test_dict3.txt
var testDict string
// main builds a segmenter from the embedded dictionary plus the custom
// entries in testDict, loads the embedded stop words, and prints several
// segmentations of a mixed Chinese/English sample.
func main() {
	// Alternative: load dictionaries and stop words from files.
	// var seg gse.Segmenter
	// seg.LoadDict("zh, testdata/test_dict.txt, testdata/test_dict1.txt")
	// seg.LoadStop()
	seg, err := gse.NewEmbed("zh, word 20 n"+testDict, "en")
	// The error must be consumed: an unused `err` does not compile in Go, and
	// continuing after a failed construction would hide the failure.
	if err != nil {
		fmt.Println("new embed segmenter failed: ", err)
		return
	}
	// seg.LoadDictEmbed()
	seg.LoadStopEmbed()

	text1 := "你好世界, Hello world"
	fmt.Println(seg.Cut(text1, true))
	fmt.Println(seg.String(text1, true))

	segments := seg.Segment([]byte(text1))
	fmt.Println(gse.ToString(segments))
}

Look at a Chinese example

Look at a Japanese example

Elasticsearch

How to use it with elasticsearch?

go-gse-elastic

Authors

License

Gse is primarily distributed under the terms of both the MIT license and the Apache License (Version 2.0). See LICENSE-APACHE and LICENSE-MIT.

Thanks for sego and jieba(jiebago).

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

gopher-lego/gse

Folders and files

Latest commit

History

Repository files navigation

gse

Feature:

Algorithm:

Text Segmentation speed:

Binding:

Install / update

Use

Elasticsearch

Authors

License

About

Resources

License

Contributing

Uh oh!

Stars

Watchers

Forks

Releases

Packages

Uh oh!

Uh oh!

Contributors

Uh oh!

Languages

Folders and files

Latest commit

History

Repository files navigation

gse

Feature:

Algorithm:

Text Segmentation speed:

Binding:

Install / update

Use

Elasticsearch

Authors

License

About

Resources

License

Contributing

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Uh oh!

Uh oh!

Contributors

Uh oh!

Languages

Packages