
Security News
TC39 Advances 11 Proposals for Math Precision, Binary APIs, and More
TC39 advances 11 JavaScript proposals, with two moving to Stage 4, bringing better math, binary APIs, and more features one step closer to the ECMAScript spec.
github.com/go-ego/gse
Efficient multilingual NLP and text segmentation in Go; supports English, Chinese, Japanese, and other languages, and integrates with Elasticsearch and Bleve.
Gse is an implementation of jieba in Go, and aims to add NLP support and more features.
gse-bind provides bindings for JavaScript and other languages, so that gse can be used from more languages.
With Go module support (Go 1.11+), just import:
import "github.com/go-ego/gse"
Otherwise, to install the gse package, run the command:
go get -u github.com/go-ego/gse
package main
import (
_ "embed"
"fmt"
"github.com/go-ego/gse"
)
// testDict holds the contents of testdata/test_en2.txt, embedded at build time.
//go:embed testdata/test_en2.txt
var testDict string
// testEn holds the contents of testdata/test_en.txt, embedded at build time.
//go:embed testdata/test_en.txt
var testEn string
// Sample inputs segmented by main.
var (
// text is a plain English sentence with punctuation.
text = "To be or not to be, that's the question!"
// test1 contains words written without spaces between them.
test1 = "Hiworld, Helloworld!"
)
// main builds three segmenters, each loading its dictionary a different way,
// and prints the result of cutting the sample texts with each of them.
//
// A dictionary load failure is reported on stdout and the example carries on,
// so that the remaining segmenters are still demonstrated.
func main() {
	// seg1: dictionary loaded from a file path, with DictSep set to ",".
	var seg1 gse.Segmenter
	seg1.DictSep = ","
	err := seg1.LoadDict("./testdata/test_en.txt")
	if err != nil {
		fmt.Println("Load dictionary error: ", err)
	}
	s1 := seg1.Cut(text)
	fmt.Println("seg1 Cut: ", s1)
	// seg1 Cut: [to be or not to be , that's the question!]

	// seg2: dictionary loaded from a file path, with AlphaNum enabled.
	var seg2 gse.Segmenter
	seg2.AlphaNum = true
	// Check the load error here as well, matching seg1 and seg3; previously
	// it was silently dropped.
	err = seg2.LoadDict("./testdata/test_en_dict3.txt")
	if err != nil {
		fmt.Println("Load dictionary error: ", err)
	}
	s2 := seg2.Cut(test1)
	fmt.Println("seg2 Cut: ", s2)
	// seg2 Cut: [hi world , hello world !]

	// seg3: dictionary loaded from the two embedded files, joined by a newline.
	var seg3 gse.Segmenter
	seg3.AlphaNum = true
	seg3.DictSep = ","
	err = seg3.LoadDictEmbed(testDict + "\n" + testEn)
	if err != nil {
		fmt.Println("loadDictEmbed error: ", err)
	}
	s3 := seg3.Cut(text + test1)
	fmt.Println("seg3 Cut: ", s3)
	// seg3 Cut: [to be or not to be , that's the question! hi world , hello world !]

	// example2()
}
Example2:
package main
import (
"fmt"
"regexp"
"github.com/go-ego/gse"
"github.com/go-ego/gse/hmm/pos"
)
var (
// text is the mixed English/Japanese/Chinese sample sentence used by the example functions.
text = "Hello world, Helloworld. Winter is coming! こんにちは世界, 你好世界."
// new is the segmenter returned by gse.New; the error result is discarded here.
// NOTE(review): this name shadows Go's builtin new for the whole package — consider renaming.
new, _ = gse.New("zh,testdata/test_en_dict3.txt", "alpha")
// seg is the segmenter whose dictionary is loaded in main.
seg gse.Segmenter
// posSeg is the pos.Segmenter used by cutPos.
posSeg pos.Segmenter
)
// main loads the default dictionary into the package-level seg and then runs
// the cut and segCut examples.
//
// If the dictionary cannot be loaded the error is printed and the program
// stops, because the examples that follow rely on seg.
func main() {
	// Loading the default dictionary
	if err := seg.LoadDict(); err != nil {
		fmt.Println("Load dictionary error: ", err)
		return
	}
	// Loading the default dictionary with embed
	// seg.LoadDictEmbed()
	//
	// Loading the Simplified Chinese dictionary
	// seg.LoadDict("zh_s")
	// seg.LoadDictEmbed("zh_s")
	//
	// Loading the Traditional Chinese dictionary
	// seg.LoadDict("zh_t")
	//
	// Loading the Japanese dictionary
	// seg.LoadDict("jp")
	//
	// Load the dictionary
	// seg.LoadDict("your gopath"+"/src/github.com/go-ego/gse/data/dict/dictionary.txt")

	cut()
	segCut()
}
func cut() {
hmm := new.Cut(text, true)
fmt.Println("cut use hmm: ", hmm)
hmm = new.CutSearch(text, true)
fmt.Println("cut search use hmm: ", hmm)
fmt.Println("analyze: ", new.Analyze(hmm, text))
hmm = new.CutAll(text)
fmt.Println("cut all: ", hmm)
reg := regexp.MustCompile(`(\d+年|\d+月|\d+日|[\p{Latin}]+|[\p{Hangul}]+|\d+\.\d+|[a-zA-Z0-9]+)`)
text1 := `헬로월드 헬로 서울, 2021年09月10日, 3.14`
hmm = seg.CutDAG(text1, reg)
fmt.Println("Cut with hmm and regexp: ", hmm, hmm[0], hmm[6])
}
// analyzeAndTrim prints the Analyze result for the given segments, then the
// segments after seg.Trim, and finally the String and Slice output for the
// package-level sample text.
func analyzeAndTrim(cut []string) {
	fmt.Println("analyze the segment: ", seg.Analyze(cut, ""))

	trimmed := seg.Trim(cut)
	fmt.Println("cut all: ", trimmed)

	fmt.Println(seg.String(text, true))
	fmt.Println(seg.Slice(text, true))
}
// cutPos prints the output of seg.Pos on the sample text before and after
// seg.TrimPos, then does the same with posSeg.Cut and posSeg.TrimWithPos
// (using the "zg" argument) once pos.WithGse(seg) has been called.
func cutPos() {
	segPos := seg.Pos(text, true)
	fmt.Println("pos: ", segPos)
	fmt.Println("trim pos: ", seg.TrimPos(segPos))

	pos.WithGse(seg)
	tagged := posSeg.Cut(text, true)
	fmt.Println("pos: ", tagged)
	fmt.Println("trim pos: ", posSeg.TrimWithPos(tagged, "zg"))
}
func segCut() {
// Text Segmentation
tb := []byte(text)
fmt.Println(seg.String(text, true))
segments := seg.Segment(tb)
// Handle word segmentation results, search mode
fmt.Println(gse.ToString(segments, true))
}
Look at a custom dictionary example:
package main
import (
"fmt"
_ "embed"
"github.com/go-ego/gse"
)
// testDict holds the contents of test_en_dict3.txt, embedded at build time.
//go:embed test_en_dict3.txt
var testDict string
// main creates a segmenter with gse.NewEmbed from an inline dictionary entry
// and the embedded testDict, loads the embedded stop words, and prints the
// cut, trimmed, stop-filtered and stringified forms of a multilingual sentence.
//
// If NewEmbed fails the error is printed and the program stops.
func main() {
	// var seg gse.Segmenter
	// seg.LoadDict("zh, testdata/zh/test_dict.txt, testdata/zh/test_dict1.txt")
	// seg.LoadStop()
	seg, err := gse.NewEmbed("zh, word 20 n"+testDict, "en")
	// err was previously declared but never used, which Go rejects at
	// compile time; handle it so the example builds and reports failures.
	if err != nil {
		fmt.Println("NewEmbed error: ", err)
		return
	}
	// seg.LoadDictEmbed()
	seg.LoadStopEmbed()

	text1 := "Hello world, こんにちは世界, 你好世界!"
	s1 := seg.Cut(text1, true)
	fmt.Println(s1)
	fmt.Println("trim: ", seg.Trim(s1))
	fmt.Println("stop: ", seg.Stop(s1))
	fmt.Println(seg.String(text1, true))

	segments := seg.Segment([]byte(text1))
	fmt.Println(gse.ToString(segments))
}
How to use it with elasticsearch?
Gse is primarily distributed under the terms of "both the MIT license and the Apache License (Version 2.0)". See LICENSE-APACHE, LICENSE-MIT.
FAQs
Unknown package
Did you know?
Socket for GitHub automatically highlights issues in each pull request and monitors the health of all your open source dependencies. Discover the contents of your packages and block harmful activity before you install or update your dependencies.
Security News
TC39 advances 11 JavaScript proposals, with two moving to Stage 4, bringing better math, binary APIs, and more features one step closer to the ECMAScript spec.
Research
/Security News
A flawed sandbox in @nestjs/devtools-integration lets attackers run code on your machine via CSRF, leading to full Remote Code Execution (RCE).
Product
Customize license detection with Socket’s new license overlays: gain control, reduce noise, and handle edge cases with precision.