Comparing version 1.0.11 to 2.0.0-rc.1
@@ -5,3 +5,3 @@ const languageSelect = document.getElementById('languages') | ||
function updateSentence() { | ||
function updateSentence () { | ||
const language = languageSelect.value | ||
@@ -8,0 +8,0 @@ const oldString = sentenceInput.value.split(' ') |
{ | ||
"name": "stopword", | ||
"version": "1.0.11", | ||
"version": "2.0.0-rc.1", | ||
"description": "A module for node.js and the browser that takes in text and returns text that is stripped of stopwords. Has pre-defined stopword lists for 57 languages and also takes lists with custom stopwords as input.", | ||
"main": "lib/stopword.js", | ||
"main": "./dist/stopword.cjs.js", | ||
"module": "./dist/stopword.esm.js", | ||
"browser": "./dist/stopword.umd.js", | ||
"scripts": { | ||
"compile-for-browser": "webpack", | ||
"empty-sandbox": "rm -rf test/sandbox && mkdir test/sandbox", | ||
"test": "standard './*.js' './test/*.js' && npm run empty-sandbox && npm run compile-for-browser && tape ./test/test.js && cat test/sandbox/bundle.js | tape-run && rm -rf test/sandbox" | ||
"build": "rollup --config", | ||
"test": "standard './*.js' './test/*.js' && npm run build && npx ava ./test/test.cjs.js && npx ava ./test/test.esm.mjs" | ||
}, | ||
@@ -18,6 +19,4 @@ "repository": { | ||
"stopwords", | ||
"document", | ||
"processing", | ||
"document-processing", | ||
"search", | ||
"norch", | ||
"search-index", | ||
@@ -27,11 +26,5 @@ "nlp" | ||
"devDependencies": { | ||
"buffer": "6.0.3", | ||
"path-browserify": "1.0.1", | ||
"process": "0.11.10", | ||
"standard": "^16.0.3", | ||
"stream-browserify": "3.0.0", | ||
"tape": "^5.3.0", | ||
"tape-run": "^9.0.0", | ||
"webpack": "^5.47.0", | ||
"webpack-cli": "^4.7.2" | ||
"batr": "^2.0.2", | ||
"rollup-plugin-terser": "7.0.2", | ||
"rollup-plugin-license": "2.6.1" | ||
}, | ||
@@ -38,0 +31,0 @@ "author": "Fergus McDowall", |
202
README.md
@@ -7,6 +7,8 @@ # stopword | ||
## Breaking change! | ||
Language codes have changed from ISO-639-1 (two characters) to ISO-639-3. This makes room for more small languages that weren't specified in ISO-639-1. | ||
[![NPM version][npm-version-image]][npm-url] | ||
[![NPM downloads][npm-downloads-image]][npm-url] | ||
[![Build Status][travis-image]][travis-url] | ||
[![Known Vulnerabilities][snyk-image]][snyk-url] | ||
[![Build Status][CI-image]][CI-url] | ||
[![JavaScript Style Guide][standardjs-image]][standardjs-url] | ||
@@ -19,16 +21,36 @@ [![MIT License][license-image]][license-url] | ||
## Usage | ||
## Getting the script in your environment | ||
### Node.js | ||
### CJS - CommonJS | ||
Destructuring require: | ||
```javascript | ||
sw = require('stopword') | ||
// sw.removeStopwords and sw.[language code] now available | ||
const { removeStopwords, eng, fra } = require('stopword') | ||
// 'removeStopwords', 'eng' and 'fra' available | ||
``` | ||
### Script tag method | ||
Old style require: | ||
```javascript | ||
const sw = require('stopword') | ||
// sw.removeStopwords and sw.<language codes> now available | ||
``` | ||
### ESM - ECMAScript Modules | ||
Destructuring import: | ||
```javascript | ||
import { removeStopwords, eng, fra } from './dist/stopword.esm.mjs' | ||
// 'removeStopwords', 'eng' and 'fra' available | ||
``` | ||
Old style import: | ||
```javascript | ||
import * as sw from './dist/stopword.esm.mjs' | ||
// 'sw.removeStopwords' + 'sw.<language codes>' available | ||
``` | ||
### UMD - Script tag method | ||
```html | ||
<script src="stopword.js"></script> | ||
<script src="stopword.umd.js"></script> | ||
<script> | ||
// sw.removeStopwords and sw.[language code] now available | ||
// sw.removeStopwords and sw.<language codes> now available | ||
</script> | ||
@@ -38,2 +60,4 @@ ``` | ||
## Usage | ||
### Default (English) | ||
@@ -43,5 +67,5 @@ By default, `stopword` will strip an array of "meaningless" English words | ||
```javascript | ||
sw = require('stopword') | ||
const { removeStopwords } = require('stopword') | ||
const oldString = 'a really Interesting string with some words'.split(' ') | ||
const newString = sw.removeStopwords(oldString) | ||
const newString = removeStopwords(oldString) | ||
// newString is now [ 'really', 'Interesting', 'string', 'words' ] | ||
@@ -54,6 +78,6 @@ | ||
```javascript | ||
sw = require('stopword') | ||
const { removeStopwords, swe } = require('stopword') | ||
const oldString = 'Trädgårdsägare är beredda att pröva vad som helst för att bli av med de hatade mördarsniglarna åäö'.split(' ') | ||
// sw.sv contains swedish stopwords | ||
const newString = sw.removeStopwords(oldString, sw.sv) | ||
// swe contains swedish stopwords | ||
const newString = removeStopwords(oldString, swe) | ||
// newString is now [ 'Trädgårdsägare', 'beredda', 'pröva', 'helst', 'hatade', 'mördarsniglarna', 'åäö' ] | ||
@@ -65,6 +89,6 @@ ``` | ||
```javascript | ||
sw = require('stopword') | ||
const { removeStopwords } = require('stopword') | ||
const oldString = 'you can even roll your own custom stopword list'.split(' ') | ||
// Just add your own list/array of stopwords | ||
const newString = sw.removeStopwords(oldString, [ 'even', 'a', 'custom', 'stopword', 'list', 'is', 'possible'] | ||
const newString = removeStopwords(oldString, [ 'even', 'a', 'custom', 'stopword', 'list', 'is', 'possible']) | ||
// newString is now [ 'you', 'can', 'roll', 'your', 'own'] | ||
@@ -76,6 +100,6 @@ ``` | ||
```javascript | ||
sw = require('stopword') | ||
const { removeStopwords, eng, swe } = require('stopword') | ||
const oldString = 'a really interesting string with some words trädgårdsägare är beredda att pröva vad som helst för att bli av med de hatade mördarsniglarna'.split(' ') | ||
const customStopwords = ['interesting', 'really'] | ||
const newString = sw.removeStopwords(oldString, [...sw.en, ...sw.sv, ...customStopwords] | ||
const newString = removeStopwords(oldString, [...eng, ...swe, ...customStopwords]) | ||
// newString is now ['string', 'words', 'trädgårdsägare', 'beredda', 'pröva', 'helst', 'hatade', 'mördarsniglarna'] | ||
@@ -96,4 +120,4 @@ ``` | ||
```javascript | ||
sw = require('stopword') | ||
var text = sw.removeStopwords(text[, stopwords]) | ||
const { removeStopwords } = require('stopword') | ||
var text = removeStopwords(text[, stopwords]) | ||
// text is now an array of given words minus specified stopwords | ||
@@ -104,71 +128,71 @@ ``` | ||
Arrays of stopwords for the following 55 languages are supplied: | ||
Language codes follow [ISO 639-3 Language Code list](https://iso639-3.sil.org/code_tables/639/data/all). Arrays of stopwords for the following 57 languages are supplied: | ||
* `af` - Afrikaans | ||
* `ar` - Arabic, Modern Standard | ||
* `hy` - Armenian | ||
* `eu` - Basque | ||
* `bn` - Bengali | ||
* `br` - Breton | ||
* `bg` - Bulgarian | ||
* `ca` - Catalan | ||
* `zh` - Chinese Simplified | ||
* `hr` - Croatian | ||
* `cs` - Czech | ||
* `da` - Danish | ||
* `nl` - Dutch | ||
* `en` - English | ||
* `eo` - Esperanto | ||
* `et` - Estonian | ||
* `fa` - Farsi | ||
* `fi` - Finnish | ||
* `fr` - French | ||
* `gl` - Galician | ||
* `de` - German | ||
* `el` - Greek | ||
* `ha` - Hausa | ||
* `he` - Hebrew | ||
* `hi` - Hindi | ||
* `hu` - Hungarian | ||
* `id` - Indonesian | ||
* `ga` - Irish | ||
* `it` - Italian | ||
* `ja` - Japanese | ||
* `ko` - Korean | ||
* `la` - Latin | ||
* `lv` - Latvian | ||
* `lgg` - Lugbara (without diacritics) | ||
* `lggo` - Lugbara official (with diacritics) | ||
* `mr` - Marathi | ||
* `my` - Myanmar | ||
* `no` - Norwegian | ||
* `pl` - Polish | ||
* `pt` - Portuguese | ||
* `ptbr` - Portuguese (Brazilian) | ||
* `pa` - Punjabi Gurmukhi | ||
* `ro` - Romanian | ||
* `ru` - Russian | ||
* `sk` - Slovak | ||
* `sl` - Slovenian | ||
* `so` - Somali | ||
* `st` - Sotho | ||
* `es` - Spanish | ||
* `sw` - Swahili | ||
* `sv` - Swedish | ||
* `th` - Thai | ||
* `tl` - Tagalog (Filipino) | ||
* `tr` - Turkish | ||
* `ur` - Urdu | ||
* `vi` - Vietnamese | ||
* `yo` - Yoruba | ||
* `zu` - Zulu | ||
* `afr` - Afrikaans | ||
* `ara` - Arabic, Macrolanguage | ||
* `hye` - Armenian | ||
* `eus` - Basque | ||
* `ben` - Bengali | ||
* `bre` - Breton | ||
* `bul` - Bulgarian | ||
* `cat` - Catalan, Valencian | ||
* `zho` - Chinese, Macrolanguage | ||
* `hrv` - Croatian | ||
* `ces` - Czech | ||
* `dan` - Danish | ||
* `nld` - Dutch | ||
* `eng` - English | ||
* `epo` - Esperanto | ||
* `est` - Estonian, Macrolanguage | ||
* `fin` - Finnish | ||
* `fra` - French | ||
* `glg` - Galician | ||
* `deu` - German | ||
* `ell` - Greek, Modern | ||
* `hau` - Hausa | ||
* `heb` - Hebrew | ||
* `hin` - Hindi | ||
* `hun` - Hungarian | ||
* `ind` - Indonesian | ||
* `gle` - Irish | ||
* `ita` - Italian | ||
* `jpn` - Japanese | ||
* `kor` - Korean | ||
* `lat` - Latin | ||
* `lav` - Latvian, Macrolanguage | ||
* `lgg` - Lugbara | ||
* `lggNd` - Lugbara, No diacritics | ||
* `mar` - Marathi | ||
* `mya` - Myanmar (Burmese) | ||
* `nob` - Norwegian bokmål | ||
* `fas` - Persian (Farsi) | ||
* `pol` - Polish | ||
* `por` - Portuguese | ||
* `porBr` - Portuguese-Brazilian | ||
* `panGu` - Punjabi (Panjabi), Gurmukhi script | ||
* `ron` - Romanian (Moldavian, Moldovan) | ||
* `rus` - Russian | ||
* `slk` - Slovak | ||
* `slv` - Slovenian | ||
* `som` - Somali | ||
* `sot` - Sotho, Southern | ||
* `spa` - Spanish | ||
* `swa` - Swahili, Macrolanguage | ||
* `swe` - Swedish | ||
* `tha` - Thai | ||
* `tgl` - Tagalog (Filipino) | ||
* `tur` - Turkish | ||
* `urd` - Urdu | ||
* `vie` - Vietnamese | ||
* `yor` - Yoruba | ||
* `zul` - Zulu | ||
```javascript | ||
sw = require('stopword') | ||
norwegianStopwords = sw.no | ||
// norwegianStopwords now contains an Array of norwgian stopwords | ||
const { nob } = require('stopword') | ||
norwegianBokmaalStopwords = nob | ||
// norwegianBokmaalStopwords now contains an Array of Norwegian bokmål stopwords | ||
``` | ||
#### Languages with no space between words | ||
`ja` Japanese, `th` Thai and `zh` Chinese Simplified and some of the other languages supported have no space between words. For these languages you need to split the text into an array of words in another way than just `textString.split(' ')`. You can check out [TinySegmenter](http://chasen.org/%7Etaku/software/TinySegmenter/) for Japanese and [chinese-tokenizer](https://github.com/yishn/chinese-tokenizer) for Chinese. | ||
`jpn` Japanese, `tha` Thai, `zho` Chinese and some of the other supported languages have no spaces between words. For these languages you need to split the text into an array of words in another way than just `textString.split(' ')`. You can check out [TinySegmenter](http://chasen.org/%7Etaku/software/TinySegmenter/) for Japanese and [chinese-tokenizer](https://github.com/yishn/chinese-tokenizer) for Chinese. | ||
@@ -178,5 +202,7 @@ ## Your language missing? | ||
## Contributions | ||
Most of this work is from other projects and people, and wouldn't be possible without them. Thanks to among others the [stopwords-iso](https://github.com/stopwords-iso) project and the [more-stoplist](https://github.com/dohliam/more-stoplists) project. And thanks for all your code input: @arthurdenner, @micalevisk, @fabric-io-rodrigues, @behzadmoradi, @guysaar223, @ConnorKrammer, @GreXLin85, @nanopx and @virtual! | ||
## Contributions and licenses | ||
Most of this work is from other projects and people, and wouldn't be possible without them. Thanks to, among others, the [stopwords-iso](https://github.com/stopwords-iso) project and the [more-stoplists](https://github.com/dohliam/more-stoplists) project. And thanks for all your code input: @arthurdenner, @micalevisk, @fabric-io-rodrigues, @behzadmoradi, @guysaar223, @ConnorKrammer, @GreXLin85, @nanopx, @virtual and @JustroX! | ||
[Licenses](./dist/LICENSES.txt) for both this and all third party code. | ||
[license-image]: http://img.shields.io/badge/license-MIT-blue.svg?style=flat | ||
@@ -187,7 +213,5 @@ [license-url]: LICENSE | ||
[npm-downloads-image]: http://img.shields.io/npm/dm/stopword.svg?style=flat | ||
[travis-url]: http://travis-ci.org/fergiemcdowall/stopword | ||
[travis-image]: http://img.shields.io/travis/fergiemcdowall/stopword.svg?style=flat | ||
[snyk-url]: https://snyk.io/test/github/fergiemcdowall/stopword?targetFile=package.json | ||
[snyk-image]: https://snyk.io/test/github/fergiemcdowall/stopword/badge.svg?targetFile=package.json | ||
[CI-url]: https://github.com/fergiemcdowall/stopword/actions/workflows/tests.yml | ||
[CI-image]: https://github.com/fergiemcdowall/stopword/actions/workflows/tests.yml/badge.svg | ||
[standardjs-url]: https://standardjs.com | ||
[standardjs-image]: https://img.shields.io/badge/code_style-standard-brightgreen.svg?style=flat-square |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
No v1
QualityPackage is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
Unidentified License
License(Experimental) Something that seems like a license was found, but its contents could not be matched with a known license.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
1360419
3
79
6320
207
2
70
1