@node-rs/jieba
Advanced tools
| export const dict: Uint8Array | ||
| export const idf: Uint8Array |
+5
| const fs = require('fs') | ||
| const { join } = require('path') | ||
| module.exports.dict = fs.readFileSync(join(__dirname, 'dict.txt')) | ||
| module.exports.idf = fs.readFileSync(join(__dirname, 'idf.txt')) |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
+125
-13
| /* auto-generated by NAPI-RS */ | ||
| /* eslint-disable */ | ||
| export declare function cut(sentence: string | Uint8Array, hmm?: boolean | undefined | null): string[] | ||
| export declare class Jieba { | ||
| /** Create a new instance with empty dict */ | ||
| constructor() | ||
| /** | ||
| * Create a new instance with dict | ||
| * | ||
| * With the default dict, you can use `dict` from `@node-rs/jieba/dict`: | ||
| * ```js | ||
| * import { Jieba } from '@node-rs/jieba' | ||
| * import { dict } from '@node-rs/jieba/dict' | ||
| * | ||
| * const jieba = Jieba.withDict(dict) | ||
| * ``` | ||
| */ | ||
| static withDict(dict: Uint8Array): Jieba | ||
| /** Load dictionary after initialization */ | ||
| loadDict(dict: Uint8Array): void | ||
| /** | ||
| * Cut the input text | ||
| * | ||
| * ## Params | ||
| * | ||
| * `sentence`: input text | ||
| * | ||
| * `hmm`: enable HMM or not | ||
| */ | ||
| cut(sentence: string | Uint8Array, hmm?: boolean | undefined | null): string[] | ||
| /** Cut the input text asynchronously */ | ||
| cutAsync(sentence: string | Uint8Array, hmm?: boolean | undefined | null, signal?: AbortSignal | undefined | null): Promise<unknown> | ||
| /** | ||
| * Cut the input text, return all possible words | ||
| * | ||
| * ## Params | ||
| * | ||
| * `sentence`: input text | ||
| */ | ||
| cutAll(sentence: string | Uint8Array): string[] | ||
| /** | ||
| * Cut the input text in search mode | ||
| * | ||
| * ## Params | ||
| * | ||
| * `sentence`: input text | ||
| * | ||
| * `hmm`: enable HMM or not | ||
| */ | ||
| cutForSearch(sentence: string | Uint8Array, hmm?: boolean | undefined | null): string[] | ||
| /** | ||
| * Tag the input text | ||
| * | ||
| * ## Params | ||
| * | ||
| * `sentence`: input text | ||
| * | ||
| * `hmm`: enable HMM or not | ||
| */ | ||
| tag(sentence: string | Uint8Array, hmm?: boolean | undefined | null): Array<TaggedWord> | ||
| } | ||
| export declare function cutAll(sentence: string | Uint8Array): string[] | ||
| export declare class TfIdf { | ||
| static withDict(dict: Uint8Array): TfIdf | ||
| /** Creates a TfIdf. */ | ||
| constructor() | ||
| /** | ||
| * Merges entries from `dict` into the `idf_dict`. | ||
| * ```js | ||
| * import { Jieba, TfIdf } from '@node-rs/jieba'; | ||
| * | ||
| * import { dict, idf } from '@node-rs/jieba/dict'; | ||
| * | ||
| * // Create default Jieba instance | ||
| * const jieba = Jieba.withDict(dict); | ||
| * | ||
| * // Create TfIdf instance and load initial dictionary | ||
| * let initIdf = "生化学 13.900677652\n"; | ||
| * const tfidf = new TfIdf(); | ||
| * tfidf.loadDict(Buffer.from(initIdf)); | ||
| * | ||
| * // Extract keywords with initial dictionary | ||
| * const text = "生化学不是光化学的,"; | ||
| * const topK = tfidf.extractKeywords(jieba, text, 3); | ||
| * // Result would be like: | ||
| * // [ | ||
| * // { keyword: '不是', weight: 4.6335592173333335 }, | ||
| * // { keyword: '光化学', weight: 4.6335592173333335 }, | ||
| * // { keyword: '生化学', weight: 4.6335592173333335 } | ||
| * // ] | ||
| * | ||
| * // Load new dictionary with different weights | ||
| * let newIdf = "光化学 99.123456789\n"; | ||
| * tfidf.loadDict(Buffer.from(newIdf)); | ||
| * | ||
| * // Extract keywords again with updated dictionary | ||
| * const newTopK = tfidf.extractKeywords(jieba, text, 3); | ||
| * // Result would be like: | ||
| * // [ | ||
| * // { keyword: '不是', weight: 33.041152263 }, | ||
| * // { keyword: '光化学', weight: 33.041152263 }, | ||
| * // { keyword: '生化学', weight: 4.6335592173333335 } | ||
| * // ] | ||
| * ``` | ||
| */ | ||
| loadDict(dict: Uint8Array): void | ||
| setConfig(config: KeywordExtractConfig): void | ||
| /** | ||
| * Uses TF-IDF algorithm to extract the `top_k` keywords from `sentence`. | ||
| * | ||
| * If `allowed_pos` is not empty, then only terms matching those parts of | ||
| * speech are considered. | ||
| */ | ||
| extractKeywords(jieba: Jieba, sentence: string, topK: number, allowedPos?: Array<string> | undefined | null): Array<Keyword> | ||
| } | ||
| export declare function cutForSearch(sentence: string | Uint8Array, hmm?: boolean | undefined | null): string[] | ||
| export declare function extract(sentence: string | Uint8Array, topn: number, allowedPos?: string | undefined | null): Array<Keyword> | ||
| export interface Keyword { | ||
@@ -16,10 +123,15 @@ keyword: string | ||
| export declare function load(): void | ||
| /** | ||
| * Creates a KeywordExtractConfig state that contains filter criteria as | ||
| * well as segmentation configuration for use by keyword extraction | ||
| * implementations. | ||
| */ | ||
| export interface KeywordExtractConfig { | ||
| stopWords?: Set<string> | undefined | ||
| /** Any segments less than this length will not be considered a Keyword */ | ||
| minKeywordLength?: number | ||
| /** If true, fall back to hmm model if segment cannot be found in the dictionary */ | ||
| useHmm?: boolean | ||
| } | ||
| export declare function loadDict(dict: Uint8Array): void | ||
| export declare function loadTFIDFDict(dict: Uint8Array): void | ||
| export declare function tag(sentence: string | Uint8Array, hmm?: boolean | undefined | null): Array<TaggedWord> | ||
| export interface TaggedWord { | ||
@@ -26,0 +138,0 @@ tag: string |
+6
-9
| // prettier-ignore | ||
| /* eslint-disable */ | ||
| // @ts-nocheck | ||
| /* auto-generated by NAPI-RS */ | ||
| const { readFileSync } = require('fs') | ||
| const { createRequire } = require('node:module') | ||
| require = createRequire(__filename) | ||
| const { readFileSync } = require('node:fs') | ||
| let nativeBinding = null | ||
@@ -364,9 +367,3 @@ const loadErrors = [] | ||
| module.exports.cut = nativeBinding.cut | ||
| module.exports.cutAll = nativeBinding.cutAll | ||
| module.exports.cutForSearch = nativeBinding.cutForSearch | ||
| module.exports.extract = nativeBinding.extract | ||
| module.exports.load = nativeBinding.load | ||
| module.exports.loadDict = nativeBinding.loadDict | ||
| module.exports.loadTFIDFDict = nativeBinding.loadTFIDFDict | ||
| module.exports.tag = nativeBinding.tag | ||
| module.exports.Jieba = nativeBinding.Jieba | ||
| module.exports.TfIdf = nativeBinding.TfIdf |
+27
-21
| { | ||
| "name": "@node-rs/jieba", | ||
| "version": "1.10.4", | ||
| "version": "2.0.1", | ||
| "description": "Fastest Chinese word segmentation in Node.js", | ||
@@ -23,3 +23,7 @@ "keywords": [ | ||
| "browser.js", | ||
| "LICENSE" | ||
| "LICENSE", | ||
| "dict.txt", | ||
| "idf.txt", | ||
| "dict.js", | ||
| "dict.d.ts" | ||
| ], | ||
@@ -58,7 +62,7 @@ "napi": { | ||
| "artifacts": "napi artifacts -d ../../artifacts", | ||
| "bench": "cross-env NODE_ENV=production node benchmark/jieba.js", | ||
| "bench": "cross-env NODE_ENV=production node --import @oxc-node/core/register benchmark/jieba.ts", | ||
| "build": "napi build --platform --release", | ||
| "build:debug": "napi build --platform", | ||
| "prepublishOnly": "napi prepublish", | ||
| "version": "napi version && git add npm" | ||
| "version": "napi version" | ||
| }, | ||
@@ -69,4 +73,6 @@ "bugs": { | ||
| "devDependencies": { | ||
| "@napi-rs/cli": "^3.0.0-alpha.63", | ||
| "nodejieba": "^3.0.0" | ||
| "@napi-rs/cli": "^3.0.0-alpha.64", | ||
| "cross-env": "^7.0.3", | ||
| "nodejieba": "^3.0.0", | ||
| "tinybench": "^3.0.0" | ||
| }, | ||
@@ -77,19 +83,19 @@ "funding": { | ||
| }, | ||
| "gitHead": "59fddf62f9c0eaa21443a540cbd6d900d0eb2672", | ||
| "gitHead": "83ca124b3e3968c945708136d6ff68d0e0549582", | ||
| "optionalDependencies": { | ||
| "@node-rs/jieba-darwin-x64": "1.10.4", | ||
| "@node-rs/jieba-darwin-arm64": "1.10.4", | ||
| "@node-rs/jieba-win32-x64-msvc": "1.10.4", | ||
| "@node-rs/jieba-linux-x64-gnu": "1.10.4", | ||
| "@node-rs/jieba-android-arm64": "1.10.4", | ||
| "@node-rs/jieba-linux-arm64-gnu": "1.10.4", | ||
| "@node-rs/jieba-linux-arm64-musl": "1.10.4", | ||
| "@node-rs/jieba-win32-arm64-msvc": "1.10.4", | ||
| "@node-rs/jieba-linux-arm-gnueabihf": "1.10.4", | ||
| "@node-rs/jieba-linux-x64-musl": "1.10.4", | ||
| "@node-rs/jieba-freebsd-x64": "1.10.4", | ||
| "@node-rs/jieba-win32-ia32-msvc": "1.10.4", | ||
| "@node-rs/jieba-android-arm-eabi": "1.10.4", | ||
| "@node-rs/jieba-wasm32-wasi": "1.10.4" | ||
| "@node-rs/jieba-darwin-x64": "2.0.1", | ||
| "@node-rs/jieba-darwin-arm64": "2.0.1", | ||
| "@node-rs/jieba-win32-x64-msvc": "2.0.1", | ||
| "@node-rs/jieba-linux-x64-gnu": "2.0.1", | ||
| "@node-rs/jieba-android-arm64": "2.0.1", | ||
| "@node-rs/jieba-linux-arm64-gnu": "2.0.1", | ||
| "@node-rs/jieba-linux-arm64-musl": "2.0.1", | ||
| "@node-rs/jieba-win32-arm64-msvc": "2.0.1", | ||
| "@node-rs/jieba-linux-arm-gnueabihf": "2.0.1", | ||
| "@node-rs/jieba-linux-x64-musl": "2.0.1", | ||
| "@node-rs/jieba-freebsd-x64": "2.0.1", | ||
| "@node-rs/jieba-win32-ia32-msvc": "2.0.1", | ||
| "@node-rs/jieba-android-arm-eabi": "2.0.1", | ||
| "@node-rs/jieba-wasm32-wasi": "2.0.1" | ||
| } | ||
| } |
+42
-45
@@ -17,47 +17,42 @@ # `@node-rs/jieba` | ||
| ```bash | ||
| @node-rs/jieba x 3,763 ops/sec ±1.18% (92 runs sampled) | ||
| nodejieba x 2,783 ops/sec ±0.67% (91 runs sampled) | ||
| Cut 1184 words bench suite: Fastest is @node-rs/jieba | ||
| @node-rs/jieba x 16.10 ops/sec ±1.58% (44 runs sampled) | ||
| nodejieba x 9.81 ops/sec ±2.39% (29 runs sampled) | ||
| Cut 246568 words bench suite: Fastest is @node-rs/jieba | ||
| @node-rs/jieba x 1,739 ops/sec ±0.87% (92 runs sampled) | ||
| nodejieba x 931 ops/sec ±1.31% (89 runs sampled) | ||
| Tag 1184 words bench suite: Fastest is @node-rs/jieba | ||
| @node-rs/jieba x 6.19 ops/sec ±2.01% (20 runs sampled) | ||
| nodejieba x 3.06 ops/sec ±5.39% (12 runs sampled) | ||
| Tag 246568 words bench suite: Fastest is @node-rs/jieba | ||
| Benchmark Cut 1184 words result | ||
| ┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐ | ||
| │ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │ | ||
| ├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤ | ||
| │ 0 │ '@node-rs/jieba' │ '8,246' │ 121266.9342871014 │ '±0.17%' │ 4124 │ | ||
| │ 1 │ 'nodejieba' │ '6,392' │ 156439.52799499547 │ '±0.20%' │ 3197 │ | ||
| └─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘ | ||
| Benchmark Cut 246568 words result | ||
| ┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐ | ||
| │ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │ | ||
| ├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤ | ||
| │ 0 │ '@node-rs/jieba' │ '32' │ 30760703.470588237 │ '±3.01%' │ 17 │ | ||
| │ 1 │ 'nodejieba' │ '19' │ 51275112.699999996 │ '±2.68%' │ 10 │ | ||
| └─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘ | ||
| Benchmark Tag 1184 words result | ||
| ┌─────────┬──────────────────┬─────────┬───────────────────┬──────────┬─────────┐ | ||
| │ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │ | ||
| ├─────────┼──────────────────┼─────────┼───────────────────┼──────────┼─────────┤ | ||
| │ 0 │ '@node-rs/jieba' │ '3,174' │ 315048.8916876547 │ '±0.20%' │ 1588 │ | ||
| │ 1 │ 'nodejieba' │ '2,672' │ 374213.8870605615 │ '±0.23%' │ 1337 │ | ||
| └─────────┴──────────────────┴─────────┴───────────────────┴──────────┴─────────┘ | ||
| Benchmark Tag 246568 words result | ||
| ┌─────────┬──────────────────┬─────────┬────────────────────┬──────────┬─────────┐ | ||
| │ (index) │ Task Name │ ops/sec │ Average Time (ns) │ Margin │ Samples │ | ||
| ├─────────┼──────────────────┼─────────┼────────────────────┼──────────┼─────────┤ | ||
| │ 0 │ '@node-rs/jieba' │ '11' │ 84886341.7999999 │ '±5.74%' │ 10 │ | ||
| │ 1 │ 'nodejieba' │ '7' │ 125781083.30000004 │ '±4.75%' │ 10 │ | ||
| └─────────┴──────────────────┴─────────┴────────────────────┴──────────┴─────────┘ | ||
| ``` | ||
| ## Support matrix | ||
| | | node12 | node14 | node16 | node18 | | ||
| | ---------------- | ------ | ------ | ------ | ------ | | ||
| | Windows x64 | ✓ | ✓ | ✓ | ✓ | | ||
| | Windows x32 | ✓ | ✓ | ✓ | ✓ | | ||
| | Windows arm64 | ✓ | ✓ | ✓ | ✓ | | ||
| | macOS x64 | ✓ | ✓ | ✓ | ✓ | | ||
| | macOS arm64 | ✓ | ✓ | ✓ | ✓ | | ||
| | Linux x64 gnu | ✓ | ✓ | ✓ | ✓ | | ||
| | Linux x64 musl | ✓ | ✓ | ✓ | ✓ | | ||
| | Linux arm gnu | ✓ | ✓ | ✓ | ✓ | | ||
| | Linux arm64 gnu | ✓ | ✓ | ✓ | ✓ | | ||
| | Linux arm64 musl | ✓ | ✓ | ✓ | ✓ | | ||
| | Android arm64 | ✓ | ✓ | ✓ | ✓ | | ||
| | Android armv7 | ✓ | ✓ | ✓ | ✓ | | ||
| | FreeBSD x64 | ✓ | ✓ | ✓ | ✓ | | ||
| ## Usage | ||
| ```javascript | ||
| const { load, cut } = require('@node-rs/jieba') | ||
| import { Jieba } from '@node-rs/jieba' | ||
| import { dict } from '@node-rs/jieba/dict' | ||
| load() | ||
| // loadDict(fs.readFileSync(...)) | ||
| // loadTFIDFDict(fs.readFileSync(...)) | ||
| // load jieba with the default dict | ||
| const jieba = Jieba.withDict(dict) | ||
| cut('我们中出了一个叛徒', false) | ||
| console.info(jieba.cut('我们中出了一个叛徒', false)) | ||
@@ -68,7 +63,10 @@ // ["我们", "中", "出", "了", "一个", "叛徒"] | ||
| ```javascript | ||
| const { load, cut } = require('@node-rs/jieba') | ||
| import { Jieba, TfIdf } from '@node-rs/jieba' | ||
| import { dict, idf } from '@node-rs/jieba/dict' | ||
| load() | ||
| const jieba = Jieba.withDict(dict) | ||
| const tfIdf = TfIdf.withDict(idf) | ||
| extract( | ||
| tfIdf.extractKeywords( | ||
| jieba, | ||
| '今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃', | ||
@@ -88,11 +86,10 @@ 3, | ||
| ```javascript | ||
| const { loadDict, cut } = require('@node-rs/jieba') | ||
| import { Jieba } from '@node-rs/jieba' | ||
| const customDict = ['哪行 50', '干一行 51', '行一行 52', '行行 53'] | ||
| const dictBuffer = Buffer.from(customDict.join('\n'), 'utf-8') | ||
| // loadDict doc: https://github.com/fxsjy/jieba?tab=readme-ov-file#%E8%BD%BD%E5%85%A5%E8%AF%8D%E5%85%B8 | ||
| loadDict(dictBuffer) | ||
| const jieba = Jieba.withDict(dictBuffer) | ||
| const text = '人要是行干一行行一行,一行行行行行,行行行干哪行都行' | ||
| const output = cut(text, false) | ||
| const output = jieba.cut(text, false) | ||
| console.log('分词结果⤵️\n', output.join('/')) | ||
@@ -99,0 +96,0 @@ // Before: 人/要是/行/干/一行行/一行/,/一行行/行/行/行/,/行/行/行/干/哪/行/都行 |
Shell access
Supply chain risk: This module accesses the system shell. Accessing the system shell increases the risk of executing arbitrary code.
Found 1 instance in 1 package
Environment variable access
Supply chain risk: Package accesses environment variables, which may be a sign of credential stuffing or data theft.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
Shell access
Supply chain risk: This module accesses the system shell. Accessing the system shell increases the risk of executing arbitrary code.
Found 1 instance in 1 package
Environment variable access
Supply chain risk: Package accesses environment variables, which may be a sign of credential stuffing or data theft.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
11296499
63743.67%10
66.67%478
33.52%4
100%97
-3%5
25%